nixfiles/hosts/raspberry-pi5/profiles/rbp-000-add-pi-support.patch

25992 lines
928 KiB
Diff
Raw Normal View History

2024-09-04 21:44:25 +01:00
From 31acfdc558652ea480c773f095ab675218af8195 Mon Sep 17 00:00:00 2001
From: Sam Nazarko <email@samnazarko.co.uk>
Date: Tue, 2 Jun 2015 22:56:15 +0100
Subject: [PATCH 01/24] Fix UPower capability detection on Vero
Signed-off-by: Sam Nazarko <email@samnazarko.co.uk>
---
xbmc/platform/linux/powermanagement/LogindUPowerSyscall.cpp | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/xbmc/platform/linux/powermanagement/LogindUPowerSyscall.cpp b/xbmc/platform/linux/powermanagement/LogindUPowerSyscall.cpp
index bd04197a51..7cb68a19b4 100644
--- a/xbmc/platform/linux/powermanagement/LogindUPowerSyscall.cpp
+++ b/xbmc/platform/linux/powermanagement/LogindUPowerSyscall.cpp
@@ -39,8 +39,8 @@ CLogindUPowerSyscall::CLogindUPowerSyscall()
if (!m_hasUPower)
CLog::Log(LOGINFO, "LogindUPowerSyscall - UPower not found, battery information will not be available");
- m_canPowerdown = LogindCheckCapability("CanPowerOff");
- m_canReboot = LogindCheckCapability("CanReboot");
+ m_canPowerdown = true;
+ m_canReboot = true;
m_canHibernate = LogindCheckCapability("CanHibernate");
m_canSuspend = LogindCheckCapability("CanSuspend");
--
2.34.1
From 4ae22b3482359e9ce9a015b61001eb1e9385c19c Mon Sep 17 00:00:00 2001
From: Sam Nazarko <email@samnazarko.co.uk>
Date: Thu, 21 Dec 2017 11:38:02 +0000
Subject: [PATCH 02/24] Add OSMC Helper routines to improve Kodi integration
with OSMC
Signed-off-by: Sam Nazarko <email@samnazarko.co.uk>
---
xbmc/CMakeLists.txt | 2 ++
xbmc/OSMCHelper.cpp | 36 ++++++++++++++++++++++++++++++++++++
xbmc/OSMCHelper.h | 38 ++++++++++++++++++++++++++++++++++++++
3 files changed, 76 insertions(+)
create mode 100644 xbmc/OSMCHelper.cpp
create mode 100644 xbmc/OSMCHelper.h
diff --git a/xbmc/CMakeLists.txt b/xbmc/CMakeLists.txt
index b7c838b3da..3ddcc4da76 100644
--- a/xbmc/CMakeLists.txt
+++ b/xbmc/CMakeLists.txt
@@ -16,6 +16,7 @@ set(SOURCES AutoSwitch.cpp
LangInfo.cpp
MediaSource.cpp
NfoFile.cpp
+ OSMCHelper.cpp
PasswordManager.cpp
PlayListPlayer.cpp
PartyModeManager.cpp
@@ -58,6 +59,7 @@ set(HEADERS AutoSwitch.h
LockType.h
MediaSource.h
NfoFile.h
+ OSMCHelper.h
PartyModeManager.h
PasswordManager.h
PlayListPlayer.h
diff --git a/xbmc/OSMCHelper.cpp b/xbmc/OSMCHelper.cpp
new file mode 100644
index 0000000000..2b605881b1
--- /dev/null
+++ b/xbmc/OSMCHelper.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2005-2013 Team XBMC
+ * http://xbmc.org
+ *
+ * This Program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This Program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with XBMC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ */
+// OSMCHelper.cpp: implementation of OSMC helper routines
+//
+//////////////////////////////////////////////////////////////////////
+
+extern "C" {
+#include "OSMCHelper.h"
+ #if defined(__arm__)
+ /* Ensure that uname returns arm, or machine model will reflect kernel bitness only */
+ int uname(struct utsname *buf)
+ {
+ int r;
+ r = syscall(SYS_uname, buf);
+ strcpy(buf->machine, "armv7");
+ return r;
+ }
+ #endif // __arm__
+}
diff --git a/xbmc/OSMCHelper.h b/xbmc/OSMCHelper.h
new file mode 100644
index 0000000000..709d21afbe
--- /dev/null
+++ b/xbmc/OSMCHelper.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2005-2013 Team XBMC
+ * http://xbmc.org
+ *
+ * This Program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This Program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with XBMC; see the file COPYING. If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ */
+// OSMCHelper.h: routines to improve behaviour of Kodi on OSMC
+//
+//////////////////////////////////////////////////////////////////////
+
+#pragma once
+
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+
+extern "C" {
+ #if defined(__arm__)
+ /* Fix up uname for 64-bit kernels with 32-bit userland */
+ int uname(struct utsname *buf);
+ #endif // __arm__
+}
--
2.34.1
From 74b81e0fe134756e283f93b2acd20fada7bdc545 Mon Sep 17 00:00:00 2001
From: Sam Nazarko <email@samnazarko.co.uk>
Date: Sat, 6 Mar 2021 18:29:22 +0000
Subject: [PATCH 03/24] Enable DRM-PRIME for Raspberry Pi
Signed-off-by: Sam Nazarko <email@samnazarko.co.uk>
---
system/settings/linux.xml | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/system/settings/linux.xml b/system/settings/linux.xml
index 531974f3f4..89b91db23b 100644
--- a/system/settings/linux.xml
+++ b/system/settings/linux.xml
@@ -165,12 +165,12 @@
<requirement>HAS_GLES</requirement>
<visible>false</visible>
<level>2</level>
- <default>false</default>
+ <default>true</default>
<control type="toggle" />
</setting>
<setting id="videoplayer.useprimedecoderforhw" type="boolean" parent="videoplayer.useprimedecoder" label="13438" help="36172">
<requirement>HAS_GLES</requirement>
- <visible>false</visible>
+ <visible>true</visible>
<dependencies>
<dependency type="enable">
<condition setting="videoplayer.useprimedecoder" operator="is">true</condition>
@@ -184,7 +184,7 @@
<requirement>HAS_GLES</requirement>
<visible>false</visible>
<level>2</level>
- <default>1</default>
+ <default>0</default>
<constraints>
<options>
<option label="13464">0</option> <!-- DIRECT -->
--
2.34.1
From 945c325a7ff47f0aa9ed3988349bfb7bee7938a9 Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Mon, 20 Jan 2020 16:53:52 +0000
Subject: [PATCH 04/24] ffmpeg: Add v4l2 support
---
cmake/modules/FindFFMPEG.cmake | 5 +-
.../0001-rpi-Add-hevc-acceleration.patch | 23708 ++++++++++++++++
tools/depends/target/ffmpeg/CMakeLists.txt | 5 +
3 files changed, 23717 insertions(+), 1 deletion(-)
create mode 100644 tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch
diff --git a/cmake/modules/FindFFMPEG.cmake b/cmake/modules/FindFFMPEG.cmake
index 6c6bf973de..13c810591b 100644
--- a/cmake/modules/FindFFMPEG.cmake
+++ b/cmake/modules/FindFFMPEG.cmake
@@ -80,7 +80,10 @@ macro(buildFFMPEG)
-DPKG_CONFIG_PATH=${CMAKE_BINARY_DIR}/${CORE_BUILD_DIR}/lib/pkgconfig)
set(PATCH_COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_SOURCE_DIR}/tools/depends/target/ffmpeg/CMakeLists.txt
- <SOURCE_DIR>)
+ <SOURCE_DIR> &&
+ patch -p1 < ${CMAKE_SOURCE_DIR}/tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch &&
+ echo "########################################## patched ffmpeg ##############################"
+ )
if(CMAKE_GENERATOR STREQUAL Xcode)
set(FFMPEG_GENERATOR CMAKE_GENERATOR "Unix Makefiles")
diff --git a/tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch b/tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch
new file mode 100644
index 0000000000..54573fab28
--- /dev/null
+++ b/tools/depends/target/ffmpeg/0001-rpi-Add-hevc-acceleration.patch
@@ -0,0 +1,23708 @@
+diff --git a/configure b/configure
+index b6616f00b6..94c8161b91 100755
+--- a/configure
++++ b/configure
+@@ -205,6 +205,7 @@ External library support:
+ --disable-bzlib disable bzlib [autodetect]
+ --disable-coreimage disable Apple CoreImage framework [autodetect]
+ --enable-chromaprint enable audio fingerprinting with chromaprint [no]
++ --disable-epoxy disable epoxy [autodetect]
+ --enable-frei0r enable frei0r video filtering [no]
+ --enable-gcrypt enable gcrypt, needed for rtmp(t)e support
+ if openssl, librtmp or gmp is not used [no]
+@@ -281,6 +282,7 @@ External library support:
+ if openssl, gnutls or mbedtls is not used [no]
+ --enable-libtwolame enable MP2 encoding via libtwolame [no]
+ --enable-libuavs3d enable AVS3 decoding via libuavs3d [no]
++ --disable-libudev disable libudev [autodetect]
+ --enable-libv4l2 enable libv4l2/v4l-utils [no]
+ --enable-libvidstab enable video stabilization using vid.stab [no]
+ --enable-libvmaf enable vmaf filter via libvmaf [no]
+@@ -344,12 +346,16 @@ External library support:
+ --enable-libvpl enable Intel oneVPL code via libvpl if libmfx is not used [no]
+ --enable-libnpp enable Nvidia Performance Primitives-based code [no]
+ --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
++ --enable-sand enable sand video formats [rpi]
++ --enable-vout-drm enable the vout_drm module - for internal testing only [no]
++ --enable-vout-egl enable the vout_egl module - for internal testing only [no]
+ --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
+ --disable-nvenc disable Nvidia video encoding code [autodetect]
+ --enable-omx enable OpenMAX IL code [no]
+ --enable-omx-rpi enable OpenMAX IL code for Raspberry Pi [no]
+ --enable-rkmpp enable Rockchip Media Process Platform code [no]
+ --disable-v4l2-m2m disable V4L2 mem2mem code [autodetect]
++ --enable-v4l2-request enable V4L2 request API code [no]
+ --disable-vaapi disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
+ --disable-vdpau disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
+ --disable-videotoolbox disable VideoToolbox code [autodetect]
+@@ -1742,7 +1748,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
+ avfoundation
+ bzlib
+ coreimage
++ epoxy
+ iconv
++ libudev
+ libxcb
+ libxcb_shm
+ libxcb_shape
+@@ -1913,6 +1921,7 @@ HWACCEL_LIBRARY_LIST="
+ mmal
+ omx
+ opencl
++ v4l2_request
+ "
+
+ DOCUMENT_LIST="
+@@ -1930,10 +1939,14 @@ FEATURE_LIST="
+ omx_rpi
+ runtime_cpudetect
+ safe_bitstream_reader
++ sand
+ shared
+ small
+ static
+ swscale_alpha
++ vout_drm
++ vout_egl
++ v4l2_req_hevc_vx
+ "
+
+ # this list should be kept in linking order
+@@ -2495,6 +2508,7 @@ CONFIG_EXTRA="
+ rtpdec
+ rtpenc_chain
+ rv34dsp
++ sand
+ scene_sad
+ sinewin
+ snappy
+@@ -2999,6 +3013,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext"
+ dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
+ ffnvcodec_deps_any="libdl LoadLibrary"
+ nvdec_deps="ffnvcodec"
++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
+ vaapi_x11_deps="xlib_x11"
+ videotoolbox_hwaccel_deps="videotoolbox pthreads"
+ videotoolbox_hwaccel_extralibs="-framework QuartzCore"
+@@ -3042,6 +3057,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC"
+ hevc_dxva2_hwaccel_select="hevc_decoder"
+ hevc_nvdec_hwaccel_deps="nvdec"
+ hevc_nvdec_hwaccel_select="hevc_decoder"
++hevc_v4l2request_hwaccel_deps="v4l2_request"
++hevc_v4l2request_hwaccel_select="hevc_decoder"
+ hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
+ hevc_vaapi_hwaccel_select="hevc_decoder"
+ hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
+@@ -3549,8 +3566,11 @@ sndio_indev_deps="sndio"
+ sndio_outdev_deps="sndio"
+ v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
+ v4l2_indev_suggest="libv4l2"
++v4l2_outdev_deps="libdrm"
+ v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
+ v4l2_outdev_suggest="libv4l2"
++vout_drm_outdev_deps="libdrm"
++vout_egl_outdev_deps="xlib epoxy"
+ vfwcap_indev_deps="vfw32 vfwcap_defines"
+ xcbgrab_indev_deps="libxcb"
+ xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
+@@ -3751,6 +3771,7 @@ tonemap_opencl_filter_deps="opencl const_nan"
+ transpose_opencl_filter_deps="opencl"
+ transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
+ transpose_vulkan_filter_deps="vulkan spirv_compiler"
++unsand_filter_select="sand"
+ unsharp_opencl_filter_deps="opencl"
+ uspp_filter_deps="gpl avcodec"
+ vaguedenoiser_filter_deps="gpl"
+@@ -6335,6 +6356,12 @@ if enabled xlib; then
+ disable xlib
+ fi
+
++enabled libudev &&
++ check_pkg_config libudev libudev libudev.h udev_new
++
++enabled epoxy &&
++ check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
++
+ check_headers direct.h
+ check_headers dirent.h
+ check_headers dxgidebug.h
+@@ -6794,8 +6821,16 @@ enabled rkmpp && { require_pkg_config rkmpp rockchip_mpp rockchip/r
+ { enabled libdrm ||
+ die "ERROR: rkmpp requires --enable-libdrm"; }
+ }
++enabled v4l2_request && { enabled libdrm ||
++ die "ERROR: v4l2-request requires --enable-libdrm"; } &&
++ { enabled libudev ||
++ die "ERROR: v4l2-request requires libudev"; }
+ enabled vapoursynth && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
+
++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
++
++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
++ { enabled xlib || die "ERROR: vout_egl requires xlib"; }
+
+ if enabled gcrypt; then
+ GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
+@@ -6876,6 +6911,10 @@ if enabled v4l2_m2m; then
+ check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
+ fi
+
++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
++disable v4l2_req_hevc_vx
++
+ check_headers sys/videoio.h
+ test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
+
+@@ -7370,6 +7409,9 @@ check_deps $CONFIG_LIST \
+
+ enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86"
+
++# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done
++enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx
++
+ case $target_os in
+ haiku)
+ disable memalign
+diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
+index d721a5e721..839da7b472 100644
+--- a/fftools/ffmpeg.c
++++ b/fftools/ffmpeg.c
+@@ -1993,8 +1993,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref
+ av_channel_layout_compare(&ifilter->ch_layout, &frame->ch_layout);
+ break;
+ case AVMEDIA_TYPE_VIDEO:
+- need_reinit |= ifilter->width != frame->width ||
+- ifilter->height != frame->height;
++ need_reinit |= ifilter->width != av_frame_cropped_width(frame) ||
++ ifilter->height != av_frame_cropped_height(frame);
+ break;
+ }
+
+@@ -2005,6 +2005,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref
+ (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
+ need_reinit = 1;
+
++ if (no_cvt_hw && fg->graph)
++ need_reinit = 0;
++
+ if (sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX)) {
+ if (!ifilter->displaymatrix || memcmp(sd->data, ifilter->displaymatrix, sizeof(int32_t) * 9))
+ need_reinit = 1;
+@@ -2274,8 +2277,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_
+ decoded_frame->top_field_first = ist->top_field_first;
+
+ ist->frames_decoded++;
+-
+- if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
++ if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
+ err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
+ if (err < 0)
+ goto fail;
+@@ -2607,7 +2609,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo
+ case AVMEDIA_TYPE_VIDEO:
+ ret = decode_video (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt,
+ &decode_failed);
+- if (!repeating || !pkt || got_output) {
++ // Pi: Do not inc dts if no_cvt_hw set
++ // V4L2 H264 decode has long latency and sometimes spits out a long
++ // stream of output without input. In this case incrementing DTS is wrong.
++ // There may be cases where the condition as written is correct so only
++ // "fix" in the cases which cause problems
++ if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
+ if (pkt && pkt->duration) {
+ duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
+ } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
+@@ -2756,12 +2763,15 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat
+ break;
+
+ if (ist->hwaccel_id == HWACCEL_GENERIC ||
+- ist->hwaccel_id == HWACCEL_AUTO) {
++ ist->hwaccel_id == HWACCEL_AUTO ||
++ no_cvt_hw) {
+ for (i = 0;; i++) {
+ config = avcodec_get_hw_config(s->codec, i);
+ if (!config)
+ break;
+- if (!(config->methods &
++ if (no_cvt_hw && (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL))
++ av_log(s, AV_LOG_DEBUG, "no_cvt_hw so trying pix_fmt %d with codec internal hwaccel\n", *p);
++ else if (!(config->methods &
+ AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX))
+ continue;
+ if (config->pix_fmt == *p)
+diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
+index f1412f6446..8f478619b3 100644
+--- a/fftools/ffmpeg.h
++++ b/fftools/ffmpeg.h
+@@ -729,6 +729,8 @@ extern enum VideoSyncMethod video_sync_method;
+ extern float frame_drop_threshold;
+ extern int do_benchmark;
+ extern int do_benchmark_all;
++extern int no_cvt_hw;
++extern int do_deinterlace;
+ extern int do_hex_dump;
+ extern int do_pkt_dump;
+ extern int copy_ts;
+diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
+index 1f5bbf6c4d..f888307762 100644
+--- a/fftools/ffmpeg_filter.c
++++ b/fftools/ffmpeg_filter.c
+@@ -1281,8 +1281,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame)
+
+ ifilter->format = frame->format;
+
+- ifilter->width = frame->width;
+- ifilter->height = frame->height;
++ ifilter->width = av_frame_cropped_width(frame);
++ ifilter->height = av_frame_cropped_height(frame);
+ ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
+
+ ifilter->sample_rate = frame->sample_rate;
+diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c
+index 88fa782470..740a5e7153 100644
+--- a/fftools/ffmpeg_hw.c
++++ b/fftools/ffmpeg_hw.c
+@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type)
+ char *name;
+ size_t index_pos;
+ int index, index_limit = 1000;
++ if (!type_name)
++ return NULL;
+ index_pos = strlen(type_name);
+ name = av_malloc(index_pos + 4);
+ if (!name)
+diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
+index 055275d813..761db36588 100644
+--- a/fftools/ffmpeg_opt.c
++++ b/fftools/ffmpeg_opt.c
+@@ -71,6 +71,7 @@ enum VideoSyncMethod video_sync_method = VSYNC_AUTO;
+ float frame_drop_threshold = 0;
+ int do_benchmark = 0;
+ int do_benchmark_all = 0;
++int no_cvt_hw = 0;
+ int do_hex_dump = 0;
+ int do_pkt_dump = 0;
+ int copy_ts = 0;
+@@ -1427,6 +1428,8 @@ const OptionDef options[] = {
+ "add timings for benchmarking" },
+ { "benchmark_all", OPT_BOOL | OPT_EXPERT, { &do_benchmark_all },
+ "add timings for each task" },
++ { "no_cvt_hw", OPT_BOOL | OPT_EXPERT, { &no_cvt_hw },
++ "do not auto-convert hw frames to sw" },
+ { "progress", HAS_ARG | OPT_EXPERT, { .func_arg = opt_progress },
+ "write program-readable progress information", "url" },
+ { "stdin", OPT_BOOL | OPT_EXPERT, { &stdin_interaction },
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 389253f5d0..8b1d669834 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -169,7 +169,10 @@ OBJS-$(CONFIG_VIDEODSP) += videodsp.o
+ OBJS-$(CONFIG_VP3DSP) += vp3dsp.o
+ OBJS-$(CONFIG_VP56DSP) += vp56dsp.o
+ OBJS-$(CONFIG_VP8DSP) += vp8dsp.o
+-OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
++OBJS-$(CONFIG_V4L2_M2M) += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
++ weak_link.o v4l2_req_dmabufs.o
++OBJS-$(CONFIG_V4L2_REQUEST) += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
++ v4l2_req_devscan.o weak_link.o
+ OBJS-$(CONFIG_WMA_FREQS) += wma_freqs.o
+ OBJS-$(CONFIG_WMV2DSP) += wmv2dsp.o
+
+@@ -996,6 +999,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL) += dxva2_hevc.o
+ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL) += dxva2_hevc.o
+ OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL) += nvdec_hevc.o
+ OBJS-$(CONFIG_HEVC_QSV_HWACCEL) += qsvdec.o
++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL) += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o
++OBJS-$(CONFIG_V4L2_REQ_HEVC_VX) += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
+ OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL) += vaapi_hevc.o h265_profile_level.o
+ OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL) += vdpau_hevc.o h265_profile_level.o
+ OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL) += nvdec_mjpeg.o
+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+index 39881a1d2b..32bc78e2be 100644
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -2221,6 +2221,17 @@ typedef struct AVHWAccel {
+ * that avctx->hwaccel_priv_data is invalid.
+ */
+ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++
++ /**
++ * Called if parsing fails
++ *
++ * An error has occured, end_frame will not be called
++ * start_frame & decode_slice may or may not have been called
++ * Optional
++ *
++ * @param avctx the codec context
++ */
++ void (*abort_frame)(AVCodecContext *avctx);
+ } AVHWAccel;
+
+ /**
+diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h
+new file mode 100644
+index 0000000000..72cbba0953
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v1.h
+@@ -0,0 +1,229 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_MPEG_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_MPEG_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_MPEG_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_MPEG_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_MPEG_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_MPEG_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++
++struct v4l2_ctrl_hevc_pps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 rps;
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6];
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_bit_offset;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 num_active_dpb_entries;
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 num_rps_poc_st_curr_before;
++ __u8 num_rps_poc_st_curr_after;
++ __u8 num_rps_poc_lt_curr;
++
++ __u8 padding;
++
++ __u32 entry_point_offset_minus1[256];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags;
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h
+new file mode 100644
+index 0000000000..7cbbbf055f
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v2.h
+@@ -0,0 +1,257 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE 0x01
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER 0x02
++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR 0x03
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 rps;
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6];
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_bit_offset;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 padding[5];
++
++ __u32 entry_point_offset_minus1[256];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++struct v4l2_ctrl_hevc_decode_params {
++ __s32 pic_order_cnt_val;
++ __u8 num_active_dpb_entries;
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags;
++};
++
++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the number of data (in bits) to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h
+new file mode 100644
+index 0000000000..4e35bd583d
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v3.h
+@@ -0,0 +1,255 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS (V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS (V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE (V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE (V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++ V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps {
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++
++ __u8 padding[4];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 flags;
++ __u8 field_pic;
++ __u16 pic_order_cnt[2];
++ __u8 padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 padding[6];
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_bit_offset;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __u16 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++ __u8 padding[5];
++
++ __u32 entry_point_offset_minus1[256];
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++struct v4l2_ctrl_hevc_decode_params {
++ __s32 pic_order_cnt_val;
++ __u8 num_active_dpb_entries;
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags;
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++/* MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE (V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the number of data (in bits) to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP (V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++#endif
+diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
+new file mode 100644
+index 0000000000..c02fdbe5a8
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v4.h
+@@ -0,0 +1,524 @@
++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
++/*
++ * Video for Linux Two controls header file
++ *
++ * Copyright (C) 1999-2012 the contributors
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * Alternatively you can redistribute this file under the terms of the
++ * BSD license as stated below:
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 1. Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 2. Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in
++ * the documentation and/or other materials provided with the
++ * distribution.
++ * 3. The names of its contributors may not be used to endorse or promote
++ * products derived from this software without specific prior written
++ * permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
++ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ * The contents of this header was split off from videodev2.h. All control
++ * definitions should be added to this header, which is included by
++ * videodev2.h.
++ */
++
++#ifndef AVCODEC_HEVC_CTRLS_V4_H
++#define AVCODEC_HEVC_CTRLS_V4_H
++
++#include <linux/const.h>
++#include <linux/types.h>
++
++#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS
++#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000 /* Stateless codecs controls */
++#endif
++#ifndef V4L2_CID_CODEC_STATELESS_BASE
++#define V4L2_CID_CODEC_STATELESS_BASE (V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900)
++#endif
++
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_STATELESS_HEVC_SPS (V4L2_CID_CODEC_STATELESS_BASE + 400)
++#define V4L2_CID_STATELESS_HEVC_PPS (V4L2_CID_CODEC_STATELESS_BASE + 401)
++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 402)
++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX (V4L2_CID_CODEC_STATELESS_BASE + 403)
++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS (V4L2_CID_CODEC_STATELESS_BASE + 404)
++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE (V4L2_CID_CODEC_STATELESS_BASE + 405)
++#define V4L2_CID_STATELESS_HEVC_START_CODE (V4L2_CID_CODEC_STATELESS_BASE + 406)
++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
++
++enum v4l2_stateless_hevc_decode_mode {
++ V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
++ V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_stateless_hevc_start_code {
++ V4L2_STATELESS_HEVC_START_CODE_NONE,
++ V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B 0
++#define V4L2_HEVC_SLICE_TYPE_P 1
++#define V4L2_HEVC_SLICE_TYPE_I 2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE (1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED (1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET (1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED (1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED (1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT (1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED (1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED (1ULL << 8)
++
++/**
++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
++ *
++ * @video_parameter_set_id: specifies the value of the
++ * vps_video_parameter_set_id of the active VPS
++ * @seq_parameter_set_id: provides an identifier for the SPS for
++ * reference by other syntax elements
++ * @pic_width_in_luma_samples: specifies the width of each decoded picture
++ * in units of luma samples
++ * @pic_height_in_luma_samples: specifies the height of each decoded picture
++ * in units of luma samples
++ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
++ * samples of the luma array
++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
++ * samples of the chroma arrays
++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
++ * the variable MaxPicOrderCntLsb
++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
++ * required size of the decoded picture
++ * buffer for the codec video sequence
++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
++ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
++ * value of SpsMaxLatencyPictures array
++ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum
++ * luma coding block size
++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
++ * the maximum and minimum luma
++ * coding block size
++ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma
++ * transform block size
++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
++ * the maximum and minimum luma
++ * transform block size
++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
++ * depth for transform units of
++ * coding units coded in inter
++ * prediction mode
++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
++ * depth for transform units of
++ * coding units coded in intra
++ * prediction mode
++ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
++ * bits used to represent each of PCM sample
++ * values of the luma component
++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
++ * of bits used to represent each of PCM
++ * sample values of the chroma components
++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
++ * minimum size of coding blocks
++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
++ * the maximum and minimum size of
++ * coding blocks
++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
++ * syntax structures included in the SPS
++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
++ * reference pictures that are specified in the SPS
++ * @chroma_format_idc: specifies the chroma sampling
++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
++ * of temporal sub-layers
++ * @reserved: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_SPS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_sps {
++ __u8 video_parameter_set_id;
++ __u8 seq_parameter_set_id;
++ __u16 pic_width_in_luma_samples;
++ __u16 pic_height_in_luma_samples;
++ __u8 bit_depth_luma_minus8;
++ __u8 bit_depth_chroma_minus8;
++ __u8 log2_max_pic_order_cnt_lsb_minus4;
++ __u8 sps_max_dec_pic_buffering_minus1;
++ __u8 sps_max_num_reorder_pics;
++ __u8 sps_max_latency_increase_plus1;
++ __u8 log2_min_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_luma_coding_block_size;
++ __u8 log2_min_luma_transform_block_size_minus2;
++ __u8 log2_diff_max_min_luma_transform_block_size;
++ __u8 max_transform_hierarchy_depth_inter;
++ __u8 max_transform_hierarchy_depth_intra;
++ __u8 pcm_sample_bit_depth_luma_minus1;
++ __u8 pcm_sample_bit_depth_chroma_minus1;
++ __u8 log2_min_pcm_luma_coding_block_size_minus3;
++ __u8 log2_diff_max_min_pcm_luma_coding_block_size;
++ __u8 num_short_term_ref_pic_sets;
++ __u8 num_long_term_ref_pics_sps;
++ __u8 chroma_format_idc;
++ __u8 sps_max_sub_layers_minus1;
++
++ __u8 reserved[6];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED (1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT (1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED (1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT (1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED (1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED (1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED (1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT (1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED (1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED (1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED (1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED (1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED (1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED (1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED (1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER (1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT (1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT (1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING (1ULL << 20)
++
++/**
++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
++ *
++ * @pic_parameter_set_id: identifies the PPS for reference by other
++ * syntax elements
++ * @num_extra_slice_header_bits: specifies the number of extra slice header
++ * bits that are present in the slice header RBSP
++ * for coded pictures referring to the PPS.
++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
++ * inferred value of num_ref_idx_l0_active_minus1
++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
++ * inferred value of num_ref_idx_l1_active_minus1
++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for
++ * each slice referring to the PPS
++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
++ * tree block size and the minimum luma coding block
++ * size of coding units that convey cu_qp_delta_abs
++ * and cu_qp_delta_sign_flag
++ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
++ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
++ * partitioning the picture
++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
++ * the picture
++ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in
++ * units of coding tree blocks
++ * @row_height_minus1: this value plus 1 specifies the height of the each tile row in
++ * units of coding tree blocks
++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
++ * beta divided by 2
++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
++ * divided by 2
++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
++ * the variable Log2ParMrgLevel
++ * @reserved: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_PPS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_pps {
++ __u8 pic_parameter_set_id;
++ __u8 num_extra_slice_header_bits;
++ __u8 num_ref_idx_l0_default_active_minus1;
++ __u8 num_ref_idx_l1_default_active_minus1;
++ __s8 init_qp_minus26;
++ __u8 diff_cu_qp_delta_depth;
++ __s8 pps_cb_qp_offset;
++ __s8 pps_cr_qp_offset;
++ __u8 num_tile_columns_minus1;
++ __u8 num_tile_rows_minus1;
++ __u8 column_width_minus1[20];
++ __u8 row_height_minus1[22];
++ __s8 pps_beta_offset_div2;
++ __s8 pps_tc_offset_div2;
++ __u8 log2_parallel_merge_level_minus2;
++ __u8 reserved;
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE 0x01
++
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME 0
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD 1
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD 2
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM 3
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP 4
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP 5
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM 6
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING 7
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING 8
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM 9
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP 10
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM 11
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP 12
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX 16
++
++/**
++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
++ *
++ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
++ * @flags: long term flag for the reference frame
++ * @field_pic: whether the reference is a field picture or a frame.
++ * @reserved: padding field. Should be zeroed by applications.
++ * @pic_order_cnt_val: the picture order count of the current picture.
++ */
++struct v4l2_hevc_dpb_entry {
++ __u64 timestamp;
++ __u8 flags;
++ __u8 field_pic;
++ __u16 reserved;
++ __s32 pic_order_cnt_val;
++};
++
++/**
++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
++ *
++ * @delta_luma_weight_l0: the difference of the weighting factor applied
++ * to the luma prediction value for list 0
++ * @luma_offset_l0: the additive offset applied to the luma prediction value
++ * for list 0
++ * @delta_chroma_weight_l0: the difference of the weighting factor applied
++ * to the chroma prediction values for list 0
++ * @chroma_offset_l0: the difference of the additive offset applied to
++ * the chroma prediction values for list 0
++ * @delta_luma_weight_l1: the difference of the weighting factor applied
++ * to the luma prediction value for list 1
++ * @luma_offset_l1: the additive offset applied to the luma prediction value
++ * for list 1
++ * @delta_chroma_weight_l1: the difference of the weighting factor applied
++ * to the chroma prediction values for list 1
++ * @chroma_offset_l1: the difference of the additive offset applied to
++ * the chroma prediction values for list 1
++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
++ * all luma weighting factors
++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
++ * of the denominator for all chroma
++ * weighting factors
++ */
++struct v4l2_hevc_pred_weight_table {
++ __s8 delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __s8 delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __s8 delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++ __s8 chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++ __u8 luma_log2_weight_denom;
++ __s8 delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA (1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA (1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED (1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO (1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT (1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0 (1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV (1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT (1ULL << 9)
++
++/**
++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
++ *
++ * This control is a dynamically sized 1-dimensional array,
++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
++ *
++ * @bit_size: size (in bits) of the current slice data
++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
++ * @num_entry_point_offsets: specifies the number of entry point offset syntax
++ * elements in the slice header.
++ * @nal_unit_type: specifies the coding type of the slice (B, P or I)
++ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit
++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
++ * @colour_plane_id: specifies the colour plane associated with the current slice
++ * @slice_pic_order_cnt: specifies the picture order count
++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
++ * reference index for reference picture list 0
++ * that may be used to decode the slice
++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
++ * reference index for reference picture list 1
++ * that may be used to decode the slice
++ * @collocated_ref_idx: specifies the reference index of the collocated picture used
++ * for temporal motion vector prediction
++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
++ * motion vector prediction candidates supported in
++ * the slice subtracted from 5
++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
++ * blocks in the slice
++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
++ * @slice_act_y_qp_offset: screen content extension parameters
++ * @slice_act_cb_qp_offset: screen content extension parameters
++ * @slice_act_cr_qp_offset: screen content extension parameters
++ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
++ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
++ * more fields
++ * @reserved0: padding field. Should be zeroed by applications.
++ * @slice_segment_addr: specifies the address of the first coding tree block in
++ * the slice segment
++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
++ * @short_term_ref_pic_set_size: specifies the size of short-term reference
++ * pictures set included in the SPS
++ * @long_term_ref_pic_set_size: specifies the size of long-term reference
++ * pictures set include in the SPS
++ * @pred_weight_table: the prediction weight coefficients for inter-picture
++ * prediction
++ * @reserved1: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_slice_params {
++ __u32 bit_size;
++ __u32 data_byte_offset;
++ __u32 num_entry_point_offsets;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ __u8 nal_unit_type;
++ __u8 nuh_temporal_id_plus1;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u8 slice_type;
++ __u8 colour_plane_id;
++ __s32 slice_pic_order_cnt;
++ __u8 num_ref_idx_l0_active_minus1;
++ __u8 num_ref_idx_l1_active_minus1;
++ __u8 collocated_ref_idx;
++ __u8 five_minus_max_num_merge_cand;
++ __s8 slice_qp_delta;
++ __s8 slice_cb_qp_offset;
++ __s8 slice_cr_qp_offset;
++ __s8 slice_act_y_qp_offset;
++ __s8 slice_act_cb_qp_offset;
++ __s8 slice_act_cr_qp_offset;
++ __s8 slice_beta_offset_div2;
++ __s8 slice_tc_offset_div2;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ __u8 pic_struct;
++
++ __u8 reserved0[3];
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ __u32 slice_segment_addr;
++ __u8 ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u16 short_term_ref_pic_set_size;
++ __u16 long_term_ref_pic_set_size;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++ struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++ __u8 reserved1[2];
++ __u64 flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC 0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC 0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR 0x4
++
++/**
++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
++ *
++ * @pic_order_cnt_val: picture order count
++ * @short_term_ref_pic_set_size: specifies the size of short-term reference
++ * pictures set included in the SPS of the first slice
++ * @long_term_ref_pic_set_size: specifies the size of long-term reference
++ * pictures set include in the SPS of the first slice
++ * @num_active_dpb_entries: the number of entries in dpb
++ * @num_poc_st_curr_before: the number of reference pictures in the short-term
++ * set that come before the current frame
++ * @num_poc_st_curr_after: the number of reference pictures in the short-term
++ * set that come after the current frame
++ * @num_poc_lt_curr: the number of reference pictures in the long-term set
++ * @poc_st_curr_before: provides the index of the short term before references
++ * in DPB array
++ * @poc_st_curr_after: provides the index of the short term after references
++ * in DPB array
++ * @poc_lt_curr: provides the index of the long term references in DPB array
++ * @reserved: padding field. Should be zeroed by applications.
++ * @dpb: the decoded picture buffer, for meta-data about reference frames
++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_decode_params {
++ __s32 pic_order_cnt_val;
++ __u16 short_term_ref_pic_set_size;
++ __u16 long_term_ref_pic_set_size;
++ __u8 num_active_dpb_entries;
++ __u8 num_poc_st_curr_before;
++ __u8 num_poc_st_curr_after;
++ __u8 num_poc_lt_curr;
++ __u8 poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u8 reserved[4];
++ struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++ __u64 flags;
++};
++
++/**
++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
++ *
++ * @scaling_list_4x4: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_8x8: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_16x16: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_32x32: scaling list is used for the scaling process for
++ * transform coefficients. The values on each scaling
++ * list are expected in raster scan order
++ * @scaling_list_dc_coef_16x16: scaling list is used for the scaling process
++ * for transform coefficients. The values on each
++ * scaling list are expected in raster scan order.
++ * @scaling_list_dc_coef_32x32: scaling list is used for the scaling process
++ * for transform coefficients. The values on each
++ * scaling list are expected in raster scan order.
++ */
++struct v4l2_ctrl_hevc_scaling_matrix {
++ __u8 scaling_list_4x4[6][16];
++ __u8 scaling_list_8x8[6][64];
++ __u8 scaling_list_16x16[6][64];
++ __u8 scaling_list_32x32[2][64];
++ __u8 scaling_list_dc_coef_16x16[6];
++ __u8 scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
+index 59f9a0ff3e..4ae7222e8b 100644
+--- a/libavcodec/hevc_parser.c
++++ b/libavcodec/hevc_parser.c
+@@ -97,6 +97,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal,
+ avctx->profile = ps->sps->ptl.general_ptl.profile_idc;
+ avctx->level = ps->sps->ptl.general_ptl.level_idc;
+
++ if (ps->sps->chroma_format_idc == 1) {
++ avctx->chroma_sample_location = ps->sps->vui.common.chroma_loc_info_present_flag ?
++ ps->sps->vui.common.chroma_sample_loc_type_top_field + 1 :
++ AVCHROMA_LOC_LEFT;
++ }
++ else if (ps->sps->chroma_format_idc == 2 ||
++ ps->sps->chroma_format_idc == 3) {
++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;;
++ }
++ else {
++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
++ }
++
+ if (ps->vps->vps_timing_info_present_flag) {
+ num = ps->vps->vps_num_units_in_tick;
+ den = ps->vps->vps_time_scale;
+diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
+index 811e8feff8..f7cf14eabc 100644
+--- a/libavcodec/hevc_refs.c
++++ b/libavcodec/hevc_refs.c
+@@ -98,18 +98,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
+ if (!frame->rpl_buf)
+ goto fail;
+
+- frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
+- if (!frame->tab_mvf_buf)
+- goto fail;
+- frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++ if (s->tab_mvf_pool) {
++ frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
++ if (!frame->tab_mvf_buf)
++ goto fail;
++ frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++ }
+
+- frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
+- if (!frame->rpl_tab_buf)
+- goto fail;
+- frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
+- frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
+- for (j = 0; j < frame->ctb_count; j++)
+- frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++ if (s->rpl_tab_pool) {
++ frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
++ if (!frame->rpl_tab_buf)
++ goto fail;
++ frame->rpl_tab = (RefPicListTab **)frame->rpl_tab_buf->data;
++ frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
++ for (j = 0; j < frame->ctb_count; j++)
++ frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++ }
+
+ frame->frame->top_field_first = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+ frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+@@ -297,14 +301,17 @@ static int init_slice_rpl(HEVCContext *s)
+ int ctb_count = frame->ctb_count;
+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+ int i;
++ RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
+
+ if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
+ return AVERROR_INVALIDDATA;
+
+- for (i = ctb_addr_ts; i < ctb_count; i++)
+- frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
++ if (frame->rpl_tab) {
++ for (i = ctb_addr_ts; i < ctb_count; i++)
++ frame->rpl_tab[i] = tab;
++ }
+
+- frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
++ frame->refPicList = tab->refPicList;
+
+ return 0;
+ }
+diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
+index 567e8d81d4..17f53322fb 100644
+--- a/libavcodec/hevcdec.c
++++ b/libavcodec/hevcdec.c
+@@ -347,6 +347,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps)
+ else
+ avctx->color_range = AVCOL_RANGE_MPEG;
+
++ if (sps->chroma_format_idc == 1) {
++ avctx->chroma_sample_location = sps->vui.common.chroma_loc_info_present_flag ?
++ sps->vui.common.chroma_sample_loc_type_top_field + 1 :
++ AVCHROMA_LOC_LEFT;
++ }
++ else if (sps->chroma_format_idc == 2 ||
++ sps->chroma_format_idc == 3) {
++ avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;;
++ }
++ else {
++ avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
++ }
++
+ if (sps->vui.common.colour_description_present_flag) {
+ avctx->color_primaries = sps->vui.common.colour_primaries;
+ avctx->color_trc = sps->vui.common.transfer_characteristics;
+@@ -403,6 +416,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
+ #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
+ CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
+ CONFIG_HEVC_NVDEC_HWACCEL + \
++ CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
+ CONFIG_HEVC_VAAPI_HWACCEL + \
+ CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
+ CONFIG_HEVC_VDPAU_HWACCEL)
+@@ -429,6 +443,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
+ #endif
+ #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
+ *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
++#endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ *fmt++ = AV_PIX_FMT_DRM_PRIME;
+ #endif
+ break;
+ case AV_PIX_FMT_YUV420P10:
+@@ -450,6 +467,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
+ #endif
+ #if CONFIG_HEVC_NVDEC_HWACCEL
+ *fmt++ = AV_PIX_FMT_CUDA;
++#endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ *fmt++ = AV_PIX_FMT_DRM_PRIME;
+ #endif
+ break;
+ case AV_PIX_FMT_YUV444P:
+@@ -516,6 +536,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps,
+ if (!sps)
+ return 0;
+
++ // If hwaccel then we don't need all the s/w decode helper arrays
++ if (s->avctx->hwaccel) {
++ export_stream_params(s, sps);
++
++ s->avctx->pix_fmt = pix_fmt;
++ s->ps.sps = sps;
++ s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++ return 0;
++ }
++
+ ret = pic_arrays_init(s, sps);
+ if (ret < 0)
+ goto fail;
+@@ -2870,11 +2900,13 @@ static int hevc_frame_start(HEVCContext *s)
+ ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
+ int ret;
+
+- memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+- memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
+- memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
+- memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+- memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++ if (s->horizontal_bs) {
++ memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
++ memset(s->vertical_bs, 0, s->bs_width * s->bs_height);
++ memset(s->cbf_luma, 0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
++ memset(s->is_pcm, 0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
++ memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++ }
+
+ s->is_decoded = 0;
+ s->first_nal_type = s->nal_unit_type;
+@@ -3362,8 +3394,13 @@ static int hevc_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
+
+ s->ref = NULL;
+ ret = decode_nal_units(s, avpkt->data, avpkt->size);
+- if (ret < 0)
++ if (ret < 0) {
++ // Ensure that hwaccel knows this frame is over
++ if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame)
++ s->avctx->hwaccel->abort_frame(s->avctx);
++
+ return ret;
++ }
+
+ if (avctx->hwaccel) {
+ if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
+@@ -3413,15 +3450,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
+ dst->needs_fg = 1;
+ }
+
+- dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
+- if (!dst->tab_mvf_buf)
+- goto fail;
+- dst->tab_mvf = src->tab_mvf;
++ if (src->tab_mvf_buf) {
++ dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
++ if (!dst->tab_mvf_buf)
++ goto fail;
++ dst->tab_mvf = src->tab_mvf;
++ }
+
+- dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
+- if (!dst->rpl_tab_buf)
+- goto fail;
+- dst->rpl_tab = src->rpl_tab;
++ if (src->rpl_tab_buf) {
++ dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
++ if (!dst->rpl_tab_buf)
++ goto fail;
++ dst->rpl_tab = src->rpl_tab;
++ }
+
+ dst->rpl_buf = av_buffer_ref(src->rpl_buf);
+ if (!dst->rpl_buf)
+@@ -3731,6 +3772,9 @@ const FFCodec ff_hevc_decoder = {
+ #if CONFIG_HEVC_NVDEC_HWACCEL
+ HWACCEL_NVDEC(hevc),
+ #endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++ HWACCEL_V4L2REQUEST(hevc),
++#endif
+ #if CONFIG_HEVC_VAAPI_HWACCEL
+ HWACCEL_VAAPI(hevc),
+ #endif
+diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
+index aca55831f3..f32d1c4ec4 100644
+--- a/libavcodec/hwaccels.h
++++ b/libavcodec/hwaccels.h
+@@ -40,6 +40,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel;
+ extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
+ extern const AVHWAccel ff_hevc_dxva2_hwaccel;
+ extern const AVHWAccel ff_hevc_nvdec_hwaccel;
++extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
+ extern const AVHWAccel ff_hevc_vaapi_hwaccel;
+ extern const AVHWAccel ff_hevc_vdpau_hwaccel;
+ extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
+diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h
+index 721424912c..b8aa383071 100644
+--- a/libavcodec/hwconfig.h
++++ b/libavcodec/hwconfig.h
+@@ -24,6 +24,7 @@
+
+
+ #define HWACCEL_CAP_ASYNC_SAFE (1 << 0)
++#define HWACCEL_CAP_MT_SAFE (1 << 1)
+
+
+ typedef struct AVCodecHWConfigInternal {
+@@ -70,6 +71,8 @@ typedef struct AVCodecHWConfigInternal {
+ HW_CONFIG_HWACCEL(1, 1, 0, D3D11, D3D11VA, ff_ ## codec ## _d3d11va2_hwaccel)
+ #define HWACCEL_NVDEC(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 0, CUDA, CUDA, ff_ ## codec ## _nvdec_hwaccel)
++#define HWACCEL_V4L2REQUEST(codec) \
++ HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME, DRM, ff_ ## codec ## _v4l2request_hwaccel)
+ #define HWACCEL_VAAPI(codec) \
+ HW_CONFIG_HWACCEL(1, 1, 1, VAAPI, VAAPI, ff_ ## codec ## _vaapi_hwaccel)
+ #define HWACCEL_VDPAU(codec) \
+diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
+index 3092f58510..6f41b41ac4 100644
+--- a/libavcodec/mmaldec.c
++++ b/libavcodec/mmaldec.c
+@@ -24,6 +24,9 @@
+ * MMAL Video Decoder
+ */
+
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
+ #include <bcm_host.h>
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/mmal_parameters_video.h>
+@@ -31,6 +34,7 @@
+ #include <interface/mmal/util/mmal_util_params.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+ #include <interface/mmal/vc/mmal_vc_api.h>
++#pragma GCC diagnostic pop
+ #include <stdatomic.h>
+
+ #include "avcodec.h"
+diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
+index d9d5afaa82..b14f8e9360 100644
+--- a/libavcodec/pthread_frame.c
++++ b/libavcodec/pthread_frame.c
+@@ -204,7 +204,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
+
+ /* if the previous thread uses hwaccel then we take the lock to ensure
+ * the threads don't run concurrently */
+- if (avctx->hwaccel) {
++ if (avctx->hwaccel &&
++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
+ pthread_mutex_lock(&p->parent->hwaccel_mutex);
+ p->hwaccel_serializing = 1;
+ }
+@@ -230,7 +231,7 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
+ p->hwaccel_serializing = 0;
+ pthread_mutex_unlock(&p->parent->hwaccel_mutex);
+ }
+- av_assert0(!avctx->hwaccel);
++ av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
+
+ if (p->async_serializing) {
+ p->async_serializing = 0;
+@@ -318,6 +319,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
+ }
+
+ dst->hwaccel_flags = src->hwaccel_flags;
++ if (src->hwaccel &&
++ (src->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
++ dst->hwaccel = src->hwaccel;
++ dst->hwaccel_context = src->hwaccel_context;
++ dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data;
++ }
+
+ err = av_buffer_replace(&dst->internal->pool, src->internal->pool);
+ if (err < 0)
+@@ -433,10 +440,13 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx,
+ }
+
+ /* transfer the stashed hwaccel state, if any */
+- av_assert0(!p->avctx->hwaccel);
+- FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel);
+- FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context);
+- FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
++ av_assert0(!p->avctx->hwaccel || (p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
++ if (p->avctx->hwaccel &&
++ !(p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
++ FFSWAP(const AVHWAccel*, p->avctx->hwaccel, fctx->stash_hwaccel);
++ FFSWAP(void*, p->avctx->hwaccel_context, fctx->stash_hwaccel_context);
++ FFSWAP(void*, p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
++ }
+
+ av_packet_unref(p->avpkt);
+ ret = av_packet_ref(p->avpkt, avpkt);
+@@ -590,7 +600,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
+
+ if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
+
+- if (avctx->hwaccel && !p->hwaccel_serializing) {
++ if (avctx->hwaccel &&
++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
++ !p->hwaccel_serializing) {
+ pthread_mutex_lock(&p->parent->hwaccel_mutex);
+ p->hwaccel_serializing = 1;
+ }
+@@ -607,9 +619,12 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
+ * this is done here so that this worker thread can wipe its own hwaccel
+ * state after decoding, without requiring synchronization */
+ av_assert0(!p->parent->stash_hwaccel);
+- p->parent->stash_hwaccel = avctx->hwaccel;
+- p->parent->stash_hwaccel_context = avctx->hwaccel_context;
+- p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data;
++ if (avctx->hwaccel &&
++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
++ p->parent->stash_hwaccel = avctx->hwaccel;
++ p->parent->stash_hwaccel_context = avctx->hwaccel_context;
++ p->parent->stash_hwaccel_priv = avctx->internal->hwaccel_priv_data;
++ }
+
+ pthread_mutex_lock(&p->progress_mutex);
+ if(atomic_load(&p->state) == STATE_SETUP_FINISHED){
+@@ -664,6 +679,15 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
+
+ park_frame_worker_threads(fctx, thread_count);
+
++ if (fctx->prev_thread &&
++ avctx->hwaccel && (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
++ avctx->internal->hwaccel_priv_data !=
++ fctx->prev_thread->avctx->internal->hwaccel_priv_data) {
++ if (update_context_from_thread(avctx, fctx->prev_thread->avctx, 1) < 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to update user thread.\n");
++ }
++ }
++
+ for (i = 0; i < thread_count; i++) {
+ PerThreadContext *p = &fctx->threads[i];
+ AVCodecContext *ctx = p->avctx;
+@@ -707,10 +731,13 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
+
+ /* if we have stashed hwaccel state, move it to the user-facing context,
+ * so it will be freed in avcodec_close() */
+- av_assert0(!avctx->hwaccel);
+- FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel);
+- FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context);
+- FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
++ av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
++ if (avctx->hwaccel &&
++ !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
++ FFSWAP(const AVHWAccel*, avctx->hwaccel, fctx->stash_hwaccel);
++ FFSWAP(void*, avctx->hwaccel_context, fctx->stash_hwaccel_context);
++ FFSWAP(void*, avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
++ }
+
+ av_freep(&avctx->internal->thread_ctx);
+ }
+diff --git a/libavcodec/raw.c b/libavcodec/raw.c
+index 1e5b48d1e0..1e689f9ee0 100644
+--- a/libavcodec/raw.c
++++ b/libavcodec/raw.c
+@@ -295,6 +295,12 @@ static const PixelFormatTag raw_pix_fmt_tags[] = {
+ { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
+ { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
+
++ /* RPI (Might as well define for everything) */
++ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
++ { AV_PIX_FMT_RPI4_8, MKTAG('S', 'A', 'N', 'D') },
++ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
++ { AV_PIX_FMT_RPI4_10, MKTAG('S', 'N', 'D', 'B') },
++
+ { AV_PIX_FMT_NONE, 0 },
+ };
+
+diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
+index 8c577006d9..8ca0379e12 100644
+--- a/libavcodec/rawenc.c
++++ b/libavcodec/rawenc.c
+@@ -24,6 +24,7 @@
+ * Raw Video Encoder
+ */
+
++#include "config.h"
+ #include "avcodec.h"
+ #include "codec_internal.h"
+ #include "encode.h"
+@@ -33,6 +34,10 @@
+ #include "libavutil/intreadwrite.h"
+ #include "libavutil/imgutils.h"
+ #include "libavutil/internal.h"
++#include "libavutil/avassert.h"
++#if CONFIG_SAND
++#include "libavutil/rpi_sand_fns.h"
++#endif
+
+ static av_cold int raw_encode_init(AVCodecContext *avctx)
+ {
+@@ -46,22 +51,114 @@ static av_cold int raw_encode_init(AVCodecContext *avctx)
+ return 0;
+ }
+
++#if CONFIG_SAND
++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3 / 2;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
++ dst += width * height;
++ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
++ return 0;
++}
++
++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
++ dst += width * height * 2;
++ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
++ return 0;
++}
++
++static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++ const AVFrame *frame)
++{
++ const int width = av_frame_cropped_width(frame);
++ const int height = av_frame_cropped_height(frame);
++ const int x0 = frame->crop_left;
++ const int y0 = frame->crop_top;
++ const int size = width * height * 3;
++ uint8_t * dst;
++ int ret;
++
++ if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
++ return ret;
++
++ dst = pkt->data;
++
++ av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
++ dst += width * height * 2;
++ av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
++ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
++ return 0;
++}
++#endif
++
++
+ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
+- const AVFrame *frame, int *got_packet)
++ const AVFrame *src_frame, int *got_packet)
+ {
+- int ret = av_image_get_buffer_size(frame->format,
+- frame->width, frame->height, 1);
++ int ret;
++ AVFrame * frame = NULL;
+
+- if (ret < 0)
++#if CONFIG_SAND
++ if (av_rpi_is_sand_frame(src_frame)) {
++ ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
++ av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
++ av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
++ *got_packet = (ret == 0);
+ return ret;
++ }
++#endif
++
++ if ((frame = av_frame_clone(src_frame)) == NULL) {
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
++
++ if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
++ goto fail;
++
++ ret = av_image_get_buffer_size(frame->format,
++ frame->width, frame->height, 1);
++ if (ret < 0)
++ goto fail;
+
+ if ((ret = ff_get_encode_buffer(avctx, pkt, ret, 0)) < 0)
+- return ret;
++ goto fail;
+ if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
+ (const uint8_t **)frame->data, frame->linesize,
+ frame->format,
+ frame->width, frame->height, 1)) < 0)
+- return ret;
++ goto fail;
+
+ if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
+ frame->format == AV_PIX_FMT_YUYV422) {
+@@ -77,8 +174,15 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
+ AV_WB64(&pkt->data[8 * x], v << 48 | v >> 16);
+ }
+ }
++ pkt->flags |= AV_PKT_FLAG_KEY;
++ av_frame_free(&frame);
+ *got_packet = 1;
+ return 0;
++
++fail:
++ av_frame_free(&frame);
++ *got_packet = 0;
++ return ret;
+ }
+
+ const FFCodec ff_rawvideo_encoder = {
+diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
+index 3f5471067a..8d80d19788 100644
+--- a/libavcodec/v4l2_buffers.c
++++ b/libavcodec/v4l2_buffers.c
+@@ -21,6 +21,7 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include <drm_fourcc.h>
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <sys/mman.h>
+@@ -28,57 +29,89 @@
+ #include <fcntl.h>
+ #include <poll.h>
+ #include "libavcodec/avcodec.h"
++#include "libavcodec/internal.h"
++#include "libavutil/avassert.h"
+ #include "libavutil/pixdesc.h"
++#include "libavutil/hwcontext.h"
+ #include "v4l2_context.h"
+ #include "v4l2_buffers.h"
+ #include "v4l2_m2m.h"
++#include "v4l2_req_dmabufs.h"
++#include "weak_link.h"
+
+ #define USEC_PER_SEC 1000000
+-static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
++static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+
+-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
+ {
+- return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
+- container_of(buf->context, V4L2m2mContext, output) :
+- container_of(buf->context, V4L2m2mContext, capture);
++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
++ container_of(ctx, V4L2m2mContext, output) :
++ container_of(ctx, V4L2m2mContext, capture);
+ }
+
+-static inline AVCodecContext *logger(V4L2Buffer *buf)
++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
+ {
+- return buf_to_m2mctx(buf)->avctx;
++ return ctx_to_m2mctx(buf->context);
+ }
+
+-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
++static inline AVCodecContext *logger(const V4L2Buffer * const buf)
+ {
+- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
++ return buf_to_m2mctx(buf)->avctx;
++}
+
+- if (s->avctx->pkt_timebase.num)
+- return s->avctx->pkt_timebase;
+- return s->avctx->time_base;
++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
++{
++ const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
++ const AVRational tb = s->avctx->pkt_timebase.num ?
++ s->avctx->pkt_timebase :
++ s->avctx->time_base;
++ return tb.num && tb.den ? tb : v4l2_timebase;
+ }
+
+-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
++static inline struct timeval tv_from_int(const int64_t t)
+ {
+- int64_t v4l2_pts;
++ return (struct timeval){
++ .tv_usec = t % USEC_PER_SEC,
++ .tv_sec = t / USEC_PER_SEC
++ };
++}
+
+- if (pts == AV_NOPTS_VALUE)
+- pts = 0;
++static inline int64_t int_from_tv(const struct timeval t)
++{
++ return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec;
++}
+
++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
++{
+ /* convert pts to v4l2 timebase */
+- v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
+- out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
+- out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
++ const int64_t v4l2_pts =
++ pts == AV_NOPTS_VALUE ? 0 :
++ av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
++ out->buf.timestamp = tv_from_int(v4l2_pts);
+ }
+
+-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
+ {
+- int64_t v4l2_pts;
+-
++ const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
++ return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
++#if 0
+ /* convert pts back to encoder timebase */
+- v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
+- avbuf->buf.timestamp.tv_usec;
++ return
++ avbuf->context->no_pts_rescale ? v4l2_pts :
++ v4l2_pts == 0 ? AV_NOPTS_VALUE :
++ av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++#endif
++}
+
+- return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
++{
++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++ out->planes[plane].bytesused = bytesused;
++ out->planes[plane].length = length;
++ } else {
++ out->buf.bytesused = bytesused;
++ out->buf.length = length;
++ }
+ }
+
+ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
+@@ -115,6 +148,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
+ return AVCOL_PRI_UNSPECIFIED;
+ }
+
++static void v4l2_set_color(V4L2Buffer *buf,
++ const enum AVColorPrimaries avcp,
++ const enum AVColorSpace avcs,
++ const enum AVColorTransferCharacteristic avxc)
++{
++ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
++ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
++ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
++
++ switch (avcp) {
++ case AVCOL_PRI_BT709:
++ cs = V4L2_COLORSPACE_REC709;
++ ycbcr = V4L2_YCBCR_ENC_709;
++ break;
++ case AVCOL_PRI_BT470M:
++ cs = V4L2_COLORSPACE_470_SYSTEM_M;
++ ycbcr = V4L2_YCBCR_ENC_601;
++ break;
++ case AVCOL_PRI_BT470BG:
++ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++ break;
++ case AVCOL_PRI_SMPTE170M:
++ cs = V4L2_COLORSPACE_SMPTE170M;
++ break;
++ case AVCOL_PRI_SMPTE240M:
++ cs = V4L2_COLORSPACE_SMPTE240M;
++ break;
++ case AVCOL_PRI_BT2020:
++ cs = V4L2_COLORSPACE_BT2020;
++ break;
++ case AVCOL_PRI_SMPTE428:
++ case AVCOL_PRI_SMPTE431:
++ case AVCOL_PRI_SMPTE432:
++ case AVCOL_PRI_EBU3213:
++ case AVCOL_PRI_RESERVED:
++ case AVCOL_PRI_FILM:
++ case AVCOL_PRI_UNSPECIFIED:
++ default:
++ break;
++ }
++
++ switch (avcs) {
++ case AVCOL_SPC_RGB:
++ cs = V4L2_COLORSPACE_SRGB;
++ break;
++ case AVCOL_SPC_BT709:
++ cs = V4L2_COLORSPACE_REC709;
++ break;
++ case AVCOL_SPC_FCC:
++ cs = V4L2_COLORSPACE_470_SYSTEM_M;
++ break;
++ case AVCOL_SPC_BT470BG:
++ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++ break;
++ case AVCOL_SPC_SMPTE170M:
++ cs = V4L2_COLORSPACE_SMPTE170M;
++ break;
++ case AVCOL_SPC_SMPTE240M:
++ cs = V4L2_COLORSPACE_SMPTE240M;
++ break;
++ case AVCOL_SPC_BT2020_CL:
++ cs = V4L2_COLORSPACE_BT2020;
++ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
++ break;
++ case AVCOL_SPC_BT2020_NCL:
++ cs = V4L2_COLORSPACE_BT2020;
++ break;
++ default:
++ break;
++ }
++
++ switch (xfer) {
++ case AVCOL_TRC_BT709:
++ xfer = V4L2_XFER_FUNC_709;
++ break;
++ case AVCOL_TRC_IEC61966_2_1:
++ xfer = V4L2_XFER_FUNC_SRGB;
++ break;
++ case AVCOL_TRC_SMPTE240M:
++ xfer = V4L2_XFER_FUNC_SMPTE240M;
++ break;
++ case AVCOL_TRC_SMPTE2084:
++ xfer = V4L2_XFER_FUNC_SMPTE2084;
++ break;
++ default:
++ break;
++ }
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
++ buf->context->format.fmt.pix_mp.colorspace = cs;
++ buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
++ buf->context->format.fmt.pix_mp.xfer_func = xfer;
++ } else {
++ buf->context->format.fmt.pix.colorspace = cs;
++ buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
++ buf->context->format.fmt.pix.xfer_func = xfer;
++ }
++}
++
+ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
+ {
+ enum v4l2_quantization qt;
+@@ -133,6 +265,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
+ return AVCOL_RANGE_UNSPECIFIED;
+ }
+
++static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr)
++{
++ const enum v4l2_quantization q =
++ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
++ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
++ V4L2_QUANTIZATION_DEFAULT;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
++ buf->context->format.fmt.pix_mp.quantization = q;
++ } else {
++ buf->context->format.fmt.pix.quantization = q;
++ }
++}
++
+ static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
+ {
+ enum v4l2_ycbcr_encoding ycbcr;
+@@ -209,73 +355,178 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
+ return AVCOL_TRC_UNSPECIFIED;
+ }
+
+-static void v4l2_free_buffer(void *opaque, uint8_t *unused)
++static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
+ {
+- V4L2Buffer* avbuf = opaque;
+- V4L2m2mContext *s = buf_to_m2mctx(avbuf);
++ return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
++}
+
+- if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
+- atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
++static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
++{
++ return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
++}
+
+- if (s->reinit) {
+- if (!atomic_load(&s->refcount))
+- sem_post(&s->refsync);
+- } else {
+- if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
+- /* no need to queue more buffers to the driver */
+- avbuf->status = V4L2BUF_AVAILABLE;
+- }
+- else if (avbuf->context->streamon)
+- ff_v4l2_buffer_enqueue(avbuf);
+- }
++static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff)
++{
++ buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE :
++ is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT;
++}
+
+- av_buffer_unref(&avbuf->context_ref);
++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
++{
++ AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
++ AVDRMLayerDescriptor *layer;
++
++ /* fill the DRM frame descriptor */
++ drm_desc->nb_objects = avbuf->num_planes;
++ drm_desc->nb_layers = 1;
++
++ layer = &drm_desc->layers[0];
++ layer->nb_planes = avbuf->num_planes;
++
++ for (int i = 0; i < avbuf->num_planes; i++) {
++ layer->planes[i].object_index = i;
++ layer->planes[i].offset = avbuf->plane_info[i].offset;
++ layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
+ }
++
++ switch (avbuf->context->av_pix_fmt) {
++ case AV_PIX_FMT_YUYV422:
++
++ layer->format = DRM_FORMAT_YUYV;
++ layer->nb_planes = 1;
++
++ break;
++
++ case AV_PIX_FMT_NV12:
++ case AV_PIX_FMT_NV21:
++
++ layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
++ DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 2;
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
++ break;
++
++ case AV_PIX_FMT_YUV420P:
++
++ layer->format = DRM_FORMAT_YUV420;
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 3;
++
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height;
++ layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
++
++ layer->planes[2].object_index = 0;
++ layer->planes[2].offset = layer->planes[1].offset +
++ ((avbuf->plane_info[0].bytesperline *
++ avbuf->context->format.fmt.pix.height) >> 2);
++ layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
++ break;
++
++ default:
++ drm_desc->nb_layers = 0;
++ break;
++ }
++
++ return (uint8_t *) drm_desc;
+ }
+
+-static int v4l2_buf_increase_ref(V4L2Buffer *in)
++static void v4l2_free_bufref(void *opaque, uint8_t *data)
+ {
+- V4L2m2mContext *s = buf_to_m2mctx(in);
++ AVBufferRef * bufref = (AVBufferRef *)data;
++ V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
++ struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
+
+- if (in->context_ref)
+- atomic_fetch_add(&in->context_refcount, 1);
+- else {
+- in->context_ref = av_buffer_ref(s->self_ref);
+- if (!in->context_ref)
+- return AVERROR(ENOMEM);
++ if (ctx != NULL) {
++ // Buffer still attached to context
++ V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+
+- in->context_refcount = 1;
+- }
++ ff_mutex_lock(&ctx->lock);
+
+- in->status = V4L2BUF_RET_USER;
+- atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
++ ff_v4l2_buffer_set_avail(avbuf);
+
+- return 0;
++ if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
++ /* no need to queue more buffers to the driver */
++ }
++ else if (ctx->streamon) {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
++ avbuf->buf.timestamp.tv_sec = 0;
++ avbuf->buf.timestamp.tv_usec = 0;
++ ff_v4l2_buffer_enqueue(avbuf); // will set to IN_DRIVER
++ }
++ else {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
++ }
++
++ ff_mutex_unlock(&ctx->lock);
++ }
++
++ ff_weak_link_unlock(avbuf->context_wl);
++ av_buffer_unref(&bufref);
+ }
+
+-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
++static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i)
+ {
+- int ret;
++ return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? b->m.planes[i].length : b->length;
++}
+
+- if (plane >= in->num_planes)
+- return AVERROR(EINVAL);
++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
++{
++ int i, ret;
++ const V4L2m2mContext * const s = buf_to_m2mctx(avbuf);
+
+- /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
+- *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
+- in->plane_info[plane].length, v4l2_free_buffer, in, 0);
+- if (!*buf)
+- return AVERROR(ENOMEM);
++ for (i = 0; i < avbuf->num_planes; i++) {
++ int dma_fd = -1;
++ const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i);
++
++ if (s->db_ctl != NULL) {
++ if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL)
++ return AVERROR(ENOMEM);
++ dma_fd = dmabuf_fd(avbuf->dmabuf[i]);
++ if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type))
++ avbuf->buf.m.planes[i].m.fd = dma_fd;
++ else
++ avbuf->buf.m.fd = dma_fd;
++ }
++ else {
++ struct v4l2_exportbuffer expbuf;
++ memset(&expbuf, 0, sizeof(expbuf));
++
++ expbuf.index = avbuf->buf.index;
++ expbuf.type = avbuf->buf.type;
++ expbuf.plane = i;
++
++ ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf);
++ if (ret < 0)
++ return AVERROR(errno);
++ dma_fd = expbuf.fd;
++ }
+
+- ret = v4l2_buf_increase_ref(in);
+- if (ret)
+- av_buffer_unref(buf);
++ avbuf->drm_frame.objects[i].size = blen;
++ avbuf->drm_frame.objects[i].fd = dma_fd;
++ avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ }
+
+- return ret;
++ return 0;
+ }
+
+ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
+ {
+ unsigned int bytesused, length;
++ int rv = 0;
+
+ if (plane >= out->num_planes)
+ return AVERROR(EINVAL);
+@@ -283,32 +534,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i
+ length = out->plane_info[plane].length;
+ bytesused = FFMIN(size+offset, length);
+
+- memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
+-
+- if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+- out->planes[plane].bytesused = bytesused;
+- out->planes[plane].length = length;
+- } else {
+- out->buf.bytesused = bytesused;
+- out->buf.length = length;
++ if (size > length - offset) {
++ size = length - offset;
++ rv = AVERROR(ENOMEM);
+ }
+
+- return 0;
++ memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
++
++ set_buf_length(out, plane, bytesused, length);
++
++ return rv;
++}
++
++static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
++{
++ AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
++ AVBufferRef * newbuf;
++
++ if (!bufref)
++ return NULL;
++
++ newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
++ if (newbuf == NULL)
++ av_buffer_unref(&bufref);
++
++ avbuf->status = V4L2BUF_RET_USER;
++ return newbuf;
+ }
+
+ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
+ {
+- int i, ret;
++ int i;
+
+ frame->format = avbuf->context->av_pix_fmt;
+
+- for (i = 0; i < avbuf->num_planes; i++) {
+- ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
+- if (ret)
+- return ret;
++ frame->buf[0] = wrap_avbuf(avbuf);
++ if (frame->buf[0] == NULL)
++ return AVERROR(ENOMEM);
+
++ if (buf_to_m2mctx(avbuf)->output_drm) {
++ /* 1. get references to the actual data */
++ frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
++ frame->format = AV_PIX_FMT_DRM_PRIME;
++ frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
++ return 0;
++ }
++
++
++ /* 1. get references to the actual data */
++ for (i = 0; i < avbuf->num_planes; i++) {
++ frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
+ frame->linesize[i] = avbuf->plane_info[i].bytesperline;
+- frame->data[i] = frame->buf[i]->data;
+ }
+
+ /* fixup special cases */
+@@ -317,17 +593,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
+ case AV_PIX_FMT_NV21:
+ if (avbuf->num_planes > 1)
+ break;
+- frame->linesize[1] = avbuf->plane_info[0].bytesperline;
+- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
++ frame->linesize[1] = frame->linesize[0];
++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
+ break;
+
+ case AV_PIX_FMT_YUV420P:
+ if (avbuf->num_planes > 1)
+ break;
+- frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
+- frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
+- frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
+- frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
++ frame->linesize[1] = frame->linesize[0] / 2;
++ frame->linesize[2] = frame->linesize[1];
++ frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
++ frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
+ break;
+
+ default:
+@@ -337,68 +613,127 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
+ return 0;
+ }
+
++static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h)
++{
++ if (dst_stride == src_stride && w + 32 >= dst_stride) {
++ memcpy(dst, src, dst_stride * h);
++ }
++ else {
++ while (--h >= 0) {
++ memcpy(dst, src, w);
++ dst += dst_stride;
++ src += src_stride;
++ }
++ }
++}
++
++static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
++{
++ return i != 0 && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
++}
++
++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++{
++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++ if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
++ return AVERROR(EINVAL);
++
++ av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++ // Only currently cope with single buffer types
++ if (out->buf.length != 1)
++ return AVERROR_PATCHWELCOME;
++ if (src->nb_objects != 1)
++ return AVERROR(EINVAL);
++
++ out->planes[0].m.fd = src->objects[0].fd;
++ }
++ else {
++ if (src->nb_objects != 1)
++ return AVERROR(EINVAL);
++
++ out->buf.m.fd = src->objects[0].fd;
++ }
++
++ // No need to copy src AVDescriptor and if we did then we may confuse
++ // fd close on free
++ out->ref_buf = av_buffer_ref(frame->buf[0]);
++
++ return 0;
++}
++
+ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+ {
+- int i, ret;
+- struct v4l2_format fmt = out->context->format;
+- int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
+- fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
+- int height = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
+- fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
+- int is_planar_format = 0;
+-
+- switch (pixel_format) {
+- case V4L2_PIX_FMT_YUV420M:
+- case V4L2_PIX_FMT_YVU420M:
+-#ifdef V4L2_PIX_FMT_YUV422M
+- case V4L2_PIX_FMT_YUV422M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YVU422M
+- case V4L2_PIX_FMT_YVU422M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YUV444M
+- case V4L2_PIX_FMT_YUV444M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YVU444M
+- case V4L2_PIX_FMT_YVU444M:
+-#endif
+- case V4L2_PIX_FMT_NV12M:
+- case V4L2_PIX_FMT_NV21M:
+- case V4L2_PIX_FMT_NV12MT_16X16:
+- case V4L2_PIX_FMT_NV12MT:
+- case V4L2_PIX_FMT_NV16M:
+- case V4L2_PIX_FMT_NV61M:
+- is_planar_format = 1;
+- }
+-
+- if (!is_planar_format) {
+- const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+- int planes_nb = 0;
+- int offset = 0;
+-
+- for (i = 0; i < desc->nb_components; i++)
+- planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+-
+- for (i = 0; i < planes_nb; i++) {
+- int size, h = height;
+- if (i == 1 || i == 2) {
++ int i;
++ int num_planes = 0;
++ int pel_strides[4] = {0};
++
++ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++
++ if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
++ av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
++ return -1;
++ }
++
++ for (i = 0; i != desc->nb_components; ++i) {
++ if (desc->comp[i].plane >= num_planes)
++ num_planes = desc->comp[i].plane + 1;
++ pel_strides[desc->comp[i].plane] = desc->comp[i].step;
++ }
++
++ if (out->num_planes > 1) {
++ if (num_planes != out->num_planes) {
++ av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
++ return -1;
++ }
++ for (i = 0; i != num_planes; ++i) {
++ int w = frame->width;
++ int h = frame->height;
++ if (is_chroma(desc, i, num_planes)) {
++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
+ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
+ }
+- size = frame->linesize[i] * h;
+- ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset);
+- if (ret)
+- return ret;
+- offset += size;
++
++ cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
++ frame->data[i], frame->linesize[i],
++ w * pel_strides[i], h);
++ set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
+ }
+- return 0;
+ }
++ else
++ {
++ unsigned int offset = 0;
++
++ for (i = 0; i != num_planes; ++i) {
++ int w = frame->width;
++ int h = frame->height;
++ int dst_stride = out->plane_info[0].bytesperline;
++ uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
++
++ if (is_chroma(desc, i, num_planes)) {
++ // Is chroma
++ dst_stride >>= desc->log2_chroma_w;
++ offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
++ w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
++ h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
++ }
++ else {
++ // Is luma or alpha
++ offset += dst_stride * out->context->height;
++ }
++ if (offset > out->plane_info[0].length) {
++ av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
++ return -1;
++ }
+
+- for (i = 0; i < out->num_planes; i++) {
+- ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0);
+- if (ret)
+- return ret;
++ cpy_2d(dst, dst_stride,
++ frame->data[i], frame->linesize[i],
++ w * pel_strides[i], h);
++ }
++ set_buf_length(out, 0, offset, out->plane_info[0].length);
+ }
+-
+ return 0;
+ }
+
+@@ -408,16 +743,31 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+ *
+ ******************************************************************************/
+
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
+ {
+- v4l2_set_pts(out, frame->pts);
+-
+- return v4l2_buffer_swframe_to_buf(frame, out);
++ out->buf.flags = frame->key_frame ?
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
++ // Beware that colour info is held in format rather than the actual
++ // v4l2 buffer struct so this may not be as useful as you might hope
++ v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
++ v4l2_set_color_range(out, frame->color_range);
++ // PTS & interlace are buffer vars
++ if (track_ts)
++ out->buf.timestamp = tv_from_int(track_ts);
++ else
++ v4l2_set_pts(out, frame->pts);
++ v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
++
++ return frame->format == AV_PIX_FMT_DRM_PRIME ?
++ v4l2_buffer_primeframe_to_buf(frame, out) :
++ v4l2_buffer_swframe_to_buf(frame, out);
+ }
+
+ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+ {
+ int ret;
++ V4L2Context * const ctx = avbuf->context;
+
+ av_frame_unref(frame);
+
+@@ -428,17 +778,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+
+ /* 2. get frame information */
+ frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
++ frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
++ (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
++ (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
++ AV_PICTURE_TYPE_NONE;
+ frame->color_primaries = v4l2_get_color_primaries(avbuf);
+ frame->colorspace = v4l2_get_color_space(avbuf);
+ frame->color_range = v4l2_get_color_range(avbuf);
+ frame->color_trc = v4l2_get_color_trc(avbuf);
+ frame->pts = v4l2_get_pts(avbuf);
+ frame->pkt_dts = AV_NOPTS_VALUE;
++ frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
++ frame->top_field_first = v4l2_buf_is_top_first(avbuf);
+
+ /* these values are updated also during re-init in v4l2_process_driver_event */
+- frame->height = avbuf->context->height;
+- frame->width = avbuf->context->width;
+- frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
++ frame->height = ctx->height;
++ frame->width = ctx->width;
++ frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
++
++ if (ctx->selection.height && ctx->selection.width) {
++ frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
++ frame->crop_top = ctx->selection.top < frame->height ? ctx->selection.top : 0;
++ frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
++ frame->width - (ctx->selection.left + ctx->selection.width) : 0;
++ frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
++ frame->height - (ctx->selection.top + ctx->selection.height) : 0;
++ }
+
+ /* 3. report errors upstream */
+ if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
+@@ -451,15 +816,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+
+ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
+ {
+- int ret;
+-
+ av_packet_unref(pkt);
+- ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
+- if (ret)
+- return ret;
++
++ pkt->buf = wrap_avbuf(avbuf);
++ if (pkt->buf == NULL)
++ return AVERROR(ENOMEM);
+
+ pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
+- pkt->data = pkt->buf->data;
++ pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
++ pkt->flags = 0;
+
+ if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
+ pkt->flags |= AV_PKT_FLAG_KEY;
+@@ -474,39 +839,107 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
+ return 0;
+ }
+
+-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++ const void *extdata, size_t extlen,
++ const int64_t timestamp)
+ {
+ int ret;
+
+- ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0);
+- if (ret)
++ if (extlen) {
++ ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
++ if (ret)
++ return ret;
++ }
++
++ ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
++ if (ret && ret != AVERROR(ENOMEM))
+ return ret;
+
+- v4l2_set_pts(out, pkt->pts);
++ if (timestamp)
++ out->buf.timestamp = tv_from_int(timestamp);
++ else
++ v4l2_set_pts(out, pkt->pts);
+
+- if (pkt->flags & AV_PKT_FLAG_KEY)
+- out->flags = V4L2_BUF_FLAG_KEYFRAME;
++ out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
++ (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++ (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
+
+- return 0;
++ return ret;
++}
++
++int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
++{
++ return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
+ }
+
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
++
++static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
++{
++ V4L2Buffer * const avbuf = (V4L2Buffer *)data;
++ int i;
++
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) {
++ struct V4L2Plane_info *p = avbuf->plane_info + i;
++ if (p->mm_addr != NULL)
++ munmap(p->mm_addr, p->length);
++ }
++
++ if (avbuf->dmabuf[0] == NULL) {
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
++ if (avbuf->drm_frame.objects[i].fd != -1)
++ close(avbuf->drm_frame.objects[i].fd);
++ }
++ }
++ else {
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) {
++ dmabuf_free(avbuf->dmabuf[i]);
++ }
++ }
++
++ av_buffer_unref(&avbuf->ref_buf);
++
++ ff_weak_link_unref(&avbuf->context_wl);
++
++ av_free(avbuf);
++}
++
++
++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
+ {
+- V4L2Context *ctx = avbuf->context;
+ int ret, i;
++ V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
++ AVBufferRef * bufref;
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+
+- avbuf->buf.memory = V4L2_MEMORY_MMAP;
++ *pbufref = NULL;
++ if (avbuf == NULL)
++ return AVERROR(ENOMEM);
++
++ bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
++ if (bufref == NULL) {
++ av_free(avbuf);
++ return AVERROR(ENOMEM);
++ }
++
++ avbuf->context = ctx;
++ avbuf->buf.memory = mem;
+ avbuf->buf.type = ctx->type;
+ avbuf->buf.index = index;
+
++ for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
++ avbuf->drm_frame.objects[i].fd = -1;
++ }
++
++ avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
++
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->buf.length = VIDEO_MAX_PLANES;
+ avbuf->buf.m.planes = avbuf->planes;
+ }
+
+- ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
++ ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf);
+ if (ret < 0)
+- return AVERROR(errno);
++ goto fail;
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->num_planes = 0;
+@@ -519,6 +952,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+ avbuf->num_planes = 1;
+
+ for (i = 0; i < avbuf->num_planes; i++) {
++ const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
++ (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
+
+ avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+ ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
+@@ -526,25 +961,31 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
+- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
+- PROT_READ | PROT_WRITE, MAP_SHARED,
+- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
++ avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset;
++
++ if (want_mmap)
++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
++ PROT_READ | PROT_WRITE, MAP_SHARED,
++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
+ } else {
+ avbuf->plane_info[i].length = avbuf->buf.length;
+- avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
+- PROT_READ | PROT_WRITE, MAP_SHARED,
+- buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
++ avbuf->plane_info[i].offset = 0;
++
++ if (want_mmap)
++ avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
++ PROT_READ | PROT_WRITE, MAP_SHARED,
++ buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
+ }
+
+- if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
+- return AVERROR(ENOMEM);
++ if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
++ avbuf->plane_info[i].mm_addr = NULL;
++ ret = AVERROR(ENOMEM);
++ goto fail;
++ }
+ }
+
+ avbuf->status = V4L2BUF_AVAILABLE;
+
+- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+- return 0;
+-
+ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+ avbuf->buf.m.planes = avbuf->planes;
+ avbuf->buf.length = avbuf->num_planes;
+@@ -554,20 +995,53 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+ avbuf->buf.length = avbuf->planes[0].length;
+ }
+
+- return ff_v4l2_buffer_enqueue(avbuf);
++ if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
++ if (s->output_drm) {
++ ret = v4l2_buffer_export_drm(avbuf);
++ if (ret) {
++ av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n");
++ goto fail;
++ }
++ }
++ }
++
++ *pbufref = bufref;
++ return 0;
++
++fail:
++ av_buffer_unref(&bufref);
++ return ret;
+ }
+
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
+ {
+ int ret;
++ int qc;
+
+- avbuf->buf.flags = avbuf->flags;
++ if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
++ avbuf->context->name, avbuf->buf.index,
++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
++ avbuf->context->q_count);
++ }
+
+ ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
+- if (ret < 0)
+- return AVERROR(errno);
++ if (ret < 0) {
++ int err = errno;
++ av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
++ avbuf->context->name, avbuf->buf.index,
++ err, strerror(err));
++ return AVERROR(err);
++ }
+
++ // Lock not wanted - if called from buffer free then lock already obtained
++ qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
+ avbuf->status = V4L2BUF_IN_DRIVER;
++ pthread_cond_broadcast(&avbuf->context->cond);
++
++ av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
++ avbuf->context->name, avbuf->buf.index,
++ avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
+
+ return 0;
+ }
+diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
+index 3d2ff1b9a5..444ad94b14 100644
+--- a/libavcodec/v4l2_buffers.h
++++ b/libavcodec/v4l2_buffers.h
+@@ -28,31 +28,47 @@
+ #include <stddef.h>
+ #include <linux/videodev2.h>
+
++#include "avcodec.h"
+ #include "libavutil/buffer.h"
+ #include "libavutil/frame.h"
++#include "libavutil/hwcontext_drm.h"
+ #include "packet.h"
+
+ enum V4L2Buffer_status {
+ V4L2BUF_AVAILABLE,
+ V4L2BUF_IN_DRIVER,
++ V4L2BUF_IN_USE,
+ V4L2BUF_RET_USER,
+ };
+
+ /**
+ * V4L2Buffer (wrapper for v4l2_buffer management)
+ */
++struct V4L2Context;
++struct ff_weak_link_client;
++struct dmabuf_h;
++
+ typedef struct V4L2Buffer {
+- /* each buffer needs to have a reference to its context */
++ /* each buffer needs to have a reference to its context
++ * The pointer is good enough for most operation but once the buffer has
++ * been passed to the user the buffer may become orphaned so for free ops
++ * the weak link must be used to ensure that the context is actually
++ * there
++ */
+ struct V4L2Context *context;
++ struct ff_weak_link_client *context_wl;
+
+- /* This object is refcounted per-plane, so we need to keep track
+- * of how many context-refs we are holding. */
+- AVBufferRef *context_ref;
+- atomic_uint context_refcount;
++ /* DRM descriptor */
++ AVDRMFrameDescriptor drm_frame;
++ /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
++ * are done
++ */
++ AVBufferRef * ref_buf;
+
+ /* keep track of the mmap address and mmap length */
+ struct V4L2Plane_info {
+- int bytesperline;
++ size_t bytesperline;
++ size_t offset;
+ void * mm_addr;
+ size_t length;
+ } plane_info[VIDEO_MAX_PLANES];
+@@ -63,9 +79,9 @@ typedef struct V4L2Buffer {
+ struct v4l2_buffer buf;
+ struct v4l2_plane planes[VIDEO_MAX_PLANES];
+
+- int flags;
+ enum V4L2Buffer_status status;
+
++ struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here
+ } V4L2Buffer;
+
+ /**
+@@ -101,6 +117,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
+ */
+ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++ const void *extdata, size_t extlen,
++ const int64_t timestamp);
++
+ /**
+ * Extracts the data from an AVFrame to a V4L2Buffer
+ *
+@@ -109,7 +129,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
+
+ /**
+ * Initializes a V4L2Buffer
+@@ -119,7 +139,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
+ *
+ * @returns 0 in case of success, a negative AVERROR code otherwise
+ */
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
+
+ /**
+ * Enqueues a V4L2Buffer
+@@ -130,5 +150,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
+ */
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
+
++static inline void
++ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf)
++{
++ avbuf->status = V4L2BUF_AVAILABLE;
++ av_buffer_unref(&avbuf->ref_buf);
++}
++
+
+ #endif // AVCODEC_V4L2_BUFFERS_H
+diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
+index a40be94690..79a31cf930 100644
+--- a/libavcodec/v4l2_context.c
++++ b/libavcodec/v4l2_context.c
+@@ -27,11 +27,13 @@
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <poll.h>
++#include "libavutil/avassert.h"
+ #include "libavcodec/avcodec.h"
+ #include "decode.h"
+ #include "v4l2_buffers.h"
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
++#include "weak_link.h"
+
+ struct v4l2_format_update {
+ uint32_t v4l2_fmt;
+@@ -41,26 +43,168 @@ struct v4l2_format_update {
+ int update_avfmt;
+ };
+
+-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
++
++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
+ {
+- return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+- container_of(ctx, V4L2m2mContext, output) :
+- container_of(ctx, V4L2m2mContext, capture);
++ return (int64_t)n;
+ }
+
+-static inline AVCodecContext *logger(V4L2Context *ctx)
++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
+ {
+- return ctx_to_m2mctx(ctx)->avctx;
++ return (unsigned int)pts;
++}
++
++// FFmpeg requires us to propagate a number of vars from the coded pkt into
++// the decoded frame. The only thing that tracks like that in V4L2 stateful
++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
++// guarantees about PTS being unique or specified for every frame so replace
++// the supplied PTS with a simple incrementing number and keep a circular
++// buffer of all the things we want preserved (including the original PTS)
++// indexed by the tracking no.
++static int64_t
++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
++{
++ int64_t track_pts;
++
++ // Avoid 0
++ if (++x->track_no == 0)
++ x->track_no = 1;
++
++ track_pts = track_to_pts(avctx, x->track_no);
++
++ av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++ .discard = 0,
++ .pending = 1,
++ .pkt_size = avpkt->size,
++ .pts = avpkt->pts,
++ .dts = avpkt->dts,
++ .reordered_opaque = avctx->reordered_opaque,
++ .pkt_pos = avpkt->pos,
++ .pkt_duration = avpkt->duration,
++ .track_pts = track_pts
++ };
++ return track_pts;
++}
++
++static int64_t
++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
++{
++ int64_t track_pts;
++
++ // Avoid 0
++ if (++x->track_no == 0)
++ x->track_no = 1;
++
++ track_pts = track_to_pts(avctx, x->track_no);
++
++ av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
++ x->track_els[x->track_no % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++ .discard = 0,
++ .pending = 1,
++ .pkt_size = 0,
++ .pts = frame->pts,
++ .dts = AV_NOPTS_VALUE,
++ .reordered_opaque = frame->reordered_opaque,
++ .pkt_pos = frame->pkt_pos,
++ .pkt_duration = frame->pkt_duration,
++ .track_pts = track_pts
++ };
++ return track_pts;
++}
++
++
++// Returns -1 if we should discard the frame
++static int
++xlat_pts_frame_out(AVCodecContext *const avctx,
++ xlat_track_t * const x,
++ AVFrame *const frame)
++{
++ unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
++ V4L2m2mTrackEl *const t = x->track_els + n;
++ if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
++ {
++ av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++ "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++ frame->pts = AV_NOPTS_VALUE;
++ frame->pkt_dts = AV_NOPTS_VALUE;
++ frame->reordered_opaque = x->last_opaque;
++ frame->pkt_pos = -1;
++ frame->pkt_duration = 0;
++ frame->pkt_size = -1;
++ }
++ else if (!t->discard)
++ {
++ frame->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
++ frame->pkt_dts = t->dts;
++ frame->reordered_opaque = t->reordered_opaque;
++ frame->pkt_pos = t->pkt_pos;
++ frame->pkt_duration = t->pkt_duration;
++ frame->pkt_size = t->pkt_size;
++
++ x->last_opaque = x->track_els[n].reordered_opaque;
++ if (frame->pts != AV_NOPTS_VALUE)
++ x->last_pts = frame->pts;
++ t->pending = 0;
++ }
++ else
++ {
++ av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++ return -1;
++ }
++
++ av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
++ return 0;
+ }
+
+-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
++// Returns -1 if we should discard the frame
++static int
++xlat_pts_pkt_out(AVCodecContext *const avctx,
++ xlat_track_t * const x,
++ AVPacket *const pkt)
+ {
+- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++ unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE;
++ V4L2m2mTrackEl *const t = x->track_els + n;
++ if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts)
++ {
++ av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++ "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
++ pkt->pts = AV_NOPTS_VALUE;
++ }
++ else if (!t->discard)
++ {
++ pkt->pts = t->pending ? t->pts : AV_NOPTS_VALUE;
++
++ x->last_opaque = x->track_els[n].reordered_opaque;
++ if (pkt->pts != AV_NOPTS_VALUE)
++ x->last_pts = pkt->pts;
++ t->pending = 0;
++ }
++ else
++ {
++ av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
++ return -1;
++ }
++
++ // * Would like something much better than this...xlat(offset + out_count)?
++ pkt->dts = pkt->pts;
++ av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++ pkt->pts, t->track_pts, n);
++ return 0;
+ }
+
+-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
++
++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
+ {
+- return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++ return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
++ container_of(ctx, V4L2m2mContext, output) :
++ container_of(ctx, V4L2m2mContext, capture);
++}
++
++static inline AVCodecContext *logger(const V4L2Context *ctx)
++{
++ return ctx_to_m2mctx(ctx)->avctx;
+ }
+
+ static AVRational v4l2_get_sar(V4L2Context *ctx)
+@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Context *ctx)
+ return sar;
+ }
+
+-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
++static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
++{
++ return ctx->bufrefs != NULL;
++}
++
++// Width/Height changed or we don't have an alloc in the first place?
++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
+ {
+- struct v4l2_format *fmt1 = &ctx->format;
+- int ret = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+- fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
+- fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
+- :
+- fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
+- fmt1->fmt.pix.height != fmt2->fmt.pix.height;
++ const struct v4l2_format *fmt1 = &ctx->format;
++ int ret = !ctx_buffers_alloced(ctx) ||
++ (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
++ fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
++ fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
++ :
++ fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
++ fmt1->fmt.pix.height != fmt2->fmt.pix.height);
+
+ if (ret)
+- av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
++ av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
+ ctx->name,
+- v4l2_get_width(fmt1), v4l2_get_height(fmt1),
+- v4l2_get_width(fmt2), v4l2_get_height(fmt2));
++ ctx_buffers_alloced(ctx),
++ ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
++ ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
+
+ return ret;
+ }
+@@ -153,76 +305,110 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd
+ }
+ }
+
+-static int v4l2_start_decode(V4L2Context *ctx)
++static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
+ {
+- struct v4l2_decoder_cmd cmd = {
+- .cmd = V4L2_DEC_CMD_START,
+- .flags = 0,
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
++ struct v4l2_selection selection = {
++ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
++ .target = V4L2_SEL_TGT_COMPOSE
+ };
+- int ret;
+
+- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DECODER_CMD, &cmd);
+- if (ret)
++ memset(r, 0, sizeof(*r));
++ if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
+ return AVERROR(errno);
+
++ *r = selection.r;
+ return 0;
+ }
+
+-/**
+- * handle resolution change event and end of stream event
+- * returns 1 if reinit was successful, negative if it failed
+- * returns 0 if reinit was not executed
+- */
+-static int v4l2_handle_event(V4L2Context *ctx)
++static int do_source_change(V4L2m2mContext * const s)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+- struct v4l2_format cap_fmt = s->capture.format;
+- struct v4l2_event evt = { 0 };
++ AVCodecContext *const avctx = s->avctx;
++
+ int ret;
++ int reinit;
++ struct v4l2_format cap_fmt = s->capture.format;
+
+- ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
+- if (ret < 0) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
+- return 0;
+- }
++ s->capture.done = 0;
+
+- if (evt.type == V4L2_EVENT_EOS) {
+- ctx->done = 1;
++ ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
++ if (ret) {
++ av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
+ return 0;
+ }
+
+- if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
+- return 0;
++ get_default_selection(&s->capture, &s->capture.selection);
+
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
+- if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
+- return 0;
++ reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
++ if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
++ reinit = 1;
++
++ s->capture.format = cap_fmt;
++ if (reinit) {
++ s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
++ s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
+ }
+
+- if (v4l2_resolution_changed(&s->capture, &cap_fmt)) {
+- s->capture.height = v4l2_get_height(&cap_fmt);
+- s->capture.width = v4l2_get_width(&cap_fmt);
+- s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
+- } else {
+- v4l2_start_decode(ctx);
+- return 0;
++ // If we don't support selection (or it is bust) and we obviously have HD then kludge
++ if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
++ (s->capture.height == 1088 && s->capture.width == 1920)) {
++ s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
+ }
+
+- s->reinit = 1;
++ s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
+
+- if (s->avctx)
+- ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
+- if (ret < 0)
+- av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
++ av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
++ s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
++ s->capture.width, s->capture.height,
++ s->capture.selection.width, s->capture.selection.height,
++ s->capture.selection.left, s->capture.selection.top, reinit);
+
+- ret = ff_v4l2_m2m_codec_reinit(s);
+- if (ret) {
+- av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
+- return AVERROR(EINVAL);
++ if (reinit) {
++ if (avctx)
++ ret = ff_set_dimensions(s->avctx,
++ s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
++ s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
++ if (ret < 0)
++ av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
++
++ ret = ff_v4l2_m2m_codec_reinit(s);
++ if (ret) {
++ av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
++ return AVERROR(EINVAL);
++ }
++
++ if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
++ s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
++ av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
++ s->capture.width, s->capture.height,
++ ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
++ return AVERROR(EINVAL);
++ }
++
++ // Update pixel format - should only actually do something on initial change
++ s->capture.av_pix_fmt =
++ ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
++ if (s->output_drm) {
++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++ avctx->sw_pix_fmt = s->capture.av_pix_fmt;
++ }
++ else
++ avctx->pix_fmt = s->capture.av_pix_fmt;
++
++ goto reinit_run;
+ }
+
++ /* Buffers are OK so just stream off to ack */
++ av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
++
++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++ if (ret)
++ av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
++ s->draining = 0;
++
+ /* reinit executed */
++reinit_run:
++ ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
+ return 1;
+ }
+
+@@ -266,171 +452,293 @@ static int v4l2_stop_encode(V4L2Context *ctx)
+ return 0;
+ }
+
+-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
++// DQ a buffer
++// Amalgamates all the various ways there are of signalling EOS/Event to
++// generate a consistant EPIPE.
++//
++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
++//
++// Returns:
++// 0 Success
++// AVERROR(EPIPE) Nothing more to read
++// AVERROR(ENOSPC) No buffers in Q to put result in
++// * AVERROR(..)
++
++ static int
++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
+ {
+- struct v4l2_plane planes[VIDEO_MAX_PLANES];
+- struct v4l2_buffer buf = { 0 };
+- V4L2Buffer *avbuf;
+- struct pollfd pfd = {
+- .events = POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
+- .fd = ctx_to_m2mctx(ctx)->fd,
++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++ AVCodecContext * const avctx = m->avctx;
++ V4L2Buffer * avbuf;
++ const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
++
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++
++ struct v4l2_buffer buf = {
++ .type = ctx->type,
++ .memory = V4L2_MEMORY_MMAP,
+ };
+- int i, ret;
+
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+- break;
+- }
+- if (i == ctx->num_buffers)
+- av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
+- "userspace. Increase num_capture_buffers "
+- "to prevent device deadlock or dropped "
+- "packets/frames.\n");
+- }
+-
+- /* if we are draining and there are no more capture buffers queued in the driver we are done */
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- /* capture buffer initialization happens during decode hence
+- * detection happens at runtime
+- */
+- if (!ctx->buffers)
+- break;
+-
+- if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+- goto start;
++ *ppavbuf = NULL;
++
++ if (ctx->flag_last)
++ return AVERROR(EPIPE);
++
++ if (is_mp) {
++ buf.length = VIDEO_MAX_PLANES;
++ buf.m.planes = planes;
++ }
++
++ while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
++ const int err = errno;
++ av_assert0(AVERROR(err) < 0);
++ if (err != EINTR) {
++ av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
++ ctx->name, av_err2str(AVERROR(err)));
++
++ if (err == EPIPE)
++ ctx->flag_last = 1;
++
++ return AVERROR(err);
+ }
+- ctx->done = 1;
+- return NULL;
+ }
++ atomic_fetch_sub(&ctx->q_count, 1);
++
++ avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
++ ff_v4l2_buffer_set_avail(avbuf);
++ avbuf->buf = buf;
++ if (is_mp) {
++ memcpy(avbuf->planes, planes, sizeof(planes));
++ avbuf->buf.m.planes = avbuf->planes;
++ }
++ // Done with any attached buffer
++ av_buffer_unref(&avbuf->ref_buf);
+
+-start:
+- if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+- pfd.events = POLLOUT | POLLWRNORM;
+- else {
+- /* no need to listen to requests for more input while draining */
+- if (ctx_to_m2mctx(ctx)->draining)
+- pfd.events = POLLIN | POLLRDNORM | POLLPRI;
++ if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
++ // Zero length cap buffer return == EOS
++ if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
++
++ // Must reQ so we don't leak
++ // May not matter if the next thing we do is release all the
++ // buffers but better to be tidy.
++ ff_v4l2_buffer_enqueue(avbuf);
++
++ ctx->flag_last = 1;
++ return AVERROR(EPIPE);
++ }
++
++#ifdef V4L2_BUF_FLAG_LAST
++ // If flag_last set then this contains data but is the last frame
++ // so remember that but return OK
++ if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
++ ctx->flag_last = 1;
++#endif
+ }
+
+- for (;;) {
+- ret = poll(&pfd, 1, timeout);
+- if (ret > 0)
+- break;
+- if (errno == EINTR)
++ *ppavbuf = avbuf;
++ return 0;
++}
++
++/**
++ * handle resolution change event and end of stream event
++ * Expects to be called after the stream has stopped
++ *
++ * returns 1 if reinit was successful, negative if it failed
++ * returns 0 if reinit was not executed
++ */
++static int
++get_event(V4L2m2mContext * const m)
++{
++ AVCodecContext * const avctx = m->avctx;
++ struct v4l2_event evt = { 0 };
++
++ while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
++ const int rv = AVERROR(errno);
++ if (rv == AVERROR(EINTR))
+ continue;
+- return NULL;
++ if (rv == AVERROR(EAGAIN)) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
++ return AVERROR_EOF;
++ }
++ av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
++ return rv;
+ }
+
+- /* 0. handle errors */
+- if (pfd.revents & POLLERR) {
+- /* if we are trying to get free buffers but none have been queued yet
+- no need to raise a warning */
+- if (timeout == 0) {
+- for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
+- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+- }
+- }
+- else
+- av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
++ av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
+
+- return NULL;
++ if (evt.type == V4L2_EVENT_EOS) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
++ return AVERROR_EOF;
+ }
+
+- /* 1. handle resolution changes */
+- if (pfd.revents & POLLPRI) {
+- ret = v4l2_handle_event(ctx);
+- if (ret < 0) {
+- /* if re-init failed, abort */
+- ctx->done = 1;
+- return NULL;
+- }
+- if (ret) {
+- /* if re-init was successful drop the buffer (if there was one)
+- * since we had to reconfigure capture (unmap all buffers)
+- */
+- return NULL;
++ if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
++ return do_source_change(m);
++
++ return 0;
++}
++
++static inline int
++dq_ok(const V4L2Context * const c)
++{
++ return c->streamon && atomic_load(&c->q_count) != 0;
++}
++
++// Get a buffer
++// If output then just gets the buffer in the expected way
++// If capture then runs the capture state m/c to deal with res change etc.
++// If return value == 0 then *ppavbuf != NULL
++
++static int
++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
++{
++ V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++ AVCodecContext * const avctx = m->avctx;
++ const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
++
++ const unsigned int poll_cap = (POLLIN | POLLRDNORM);
++ const unsigned int poll_out = (POLLOUT | POLLWRNORM);
++ const unsigned int poll_event = POLLPRI;
++
++ *ppavbuf = NULL;
++
++ for (;;) {
++ struct pollfd pfd = {
++ .fd = m->fd,
++ // If capture && stream not started then assume we are waiting for the initial event
++ .events = !is_cap ? poll_out :
++ !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
++ poll_event,
++ };
++ int ret;
++
++ if (ctx->done) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
++ return AVERROR_EOF;
+ }
+- }
+
+- /* 2. dequeue the buffer */
+- if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
++ // If capture && timeout == -1 then also wait for rx buffer free
++ if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining)
++ pfd.events |= poll_out;
+
+- if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- /* there is a capture buffer ready */
+- if (pfd.revents & (POLLIN | POLLRDNORM))
+- goto dequeue;
++ // If nothing Qed all we will get is POLLERR - avoid that
++ if ((pfd.events == poll_out && !dq_ok(&m->output)) ||
++ (pfd.events == poll_cap && !dq_ok(&m->capture)) ||
++ (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) {
++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
++ return AVERROR(ENOSPC);
++ }
+
+- /* the driver is ready to accept more input; instead of waiting for the capture
+- * buffer to complete we return NULL so input can proceed (we are single threaded)
+- */
+- if (pfd.revents & (POLLOUT | POLLWRNORM))
+- return NULL;
++ // Timeout kludged s.t. "forever" eventually gives up & produces logging
++ // If waiting for an event when we have seen a last_frame then we expect
++ // it to be ready already so force a short timeout
++ ret = poll(&pfd, 1,
++ ff_v4l2_ctx_eos(ctx) ? 10 :
++ timeout == -1 ? 3000 : timeout);
++ if (ret < 0) {
++ ret = AVERROR(errno); // Remember errno before logging etc.
++ av_assert0(ret < 0);
+ }
+
+-dequeue:
+- memset(&buf, 0, sizeof(buf));
+- buf.memory = V4L2_MEMORY_MMAP;
+- buf.type = ctx->type;
+- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+- memset(planes, 0, sizeof(planes));
+- buf.length = VIDEO_MAX_PLANES;
+- buf.m.planes = planes;
++ av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
++ ctx->name, ret, timeout, pfd.events, pfd.revents);
++
++ if (ret < 0) {
++ if (ret == AVERROR(EINTR))
++ continue;
++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
++ return ret;
+ }
+
+- ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
+- if (ret) {
+- if (errno != EAGAIN) {
+- ctx->done = 1;
+- if (errno != EPIPE)
+- av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
+- ctx->name, av_err2str(AVERROR(errno)));
++ if (ret == 0) {
++ if (timeout == -1)
++ av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
++ if (ff_v4l2_ctx_eos(ctx)) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
++ ret = get_event(m);
++ if (ret < 0) {
++ ctx->done = 1;
++ return ret;
++ }
+ }
+- return NULL;
++ return AVERROR(EAGAIN);
++ }
++
++ if ((pfd.revents & POLLERR) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
++ return AVERROR_UNKNOWN;
+ }
+
+- if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
+- buf.m.planes[0].bytesused : buf.bytesused;
+- if (bytesused == 0) {
++ if ((pfd.revents & poll_event) != 0) {
++ ret = get_event(m);
++ if (ret < 0) {
+ ctx->done = 1;
+- return NULL;
++ return ret;
+ }
+-#ifdef V4L2_BUF_FLAG_LAST
+- if (buf.flags & V4L2_BUF_FLAG_LAST)
+- ctx->done = 1;
+-#endif
++ continue;
++ }
++
++ if ((pfd.revents & poll_cap) != 0) {
++ ret = dq_buf(ctx, ppavbuf);
++ if (ret == AVERROR(EPIPE))
++ continue;
++ return ret;
+ }
+
+- avbuf = &ctx->buffers[buf.index];
+- avbuf->status = V4L2BUF_AVAILABLE;
+- avbuf->buf = buf;
+- if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+- memcpy(avbuf->planes, planes, sizeof(planes));
+- avbuf->buf.m.planes = avbuf->planes;
++ if ((pfd.revents & poll_out) != 0) {
++ if (is_cap)
++ return AVERROR(EAGAIN);
++ return dq_buf(ctx, ppavbuf);
+ }
+- return avbuf;
++
++ av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
++ return AVERROR_UNKNOWN;
+ }
++}
+
+- return NULL;
++// Clear out flags and timestamps that should should be set by the user
++// Returns the passed avbuf
++static V4L2Buffer *
++clean_v4l2_buffer(V4L2Buffer * const avbuf)
++{
++ struct v4l2_buffer *const buf = &avbuf->buf;
++
++ buf->flags = 0;
++ buf->field = V4L2_FIELD_ANY;
++ buf->timestamp = (struct timeval){0};
++ buf->timecode = (struct v4l2_timecode){0};
++ buf->sequence = 0;
++
++ return avbuf;
++}
++
++int
++ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1)
++{
++ V4L2Buffer * avbuf;
++ if (timeout1 != 0) {
++ int rv = get_qbuf(ctx, &avbuf, timeout1);
++ if (rv != 0)
++ return rv;
++ }
++ do {
++ get_qbuf(ctx, &avbuf, 0);
++ } while (avbuf);
++ return 0;
+ }
+
+ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
+ {
+- int timeout = 0; /* return when no more buffers to dequeue */
+ int i;
+
+ /* get back as many output buffers as possible */
+- if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+- do {
+- } while (v4l2_dequeue_v4l2buf(ctx, timeout));
+- }
++ if (V4L2_TYPE_IS_OUTPUT(ctx->type))
++ ff_v4l2_dq_all(ctx, 0);
+
+ for (i = 0; i < ctx->num_buffers; i++) {
+- if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
+- return &ctx->buffers[i];
++ V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (avbuf->status == V4L2BUF_AVAILABLE)
++ return clean_v4l2_buffer(avbuf);
+ }
+
+ return NULL;
+@@ -438,25 +746,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
+
+ static int v4l2_release_buffers(V4L2Context* ctx)
+ {
+- struct v4l2_requestbuffers req = {
+- .memory = V4L2_MEMORY_MMAP,
+- .type = ctx->type,
+- .count = 0, /* 0 -> unmaps buffers from the driver */
+- };
+- int i, j;
++ int i;
++ int ret = 0;
++ const int fd = ctx_to_m2mctx(ctx)->fd;
+
+- for (i = 0; i < ctx->num_buffers; i++) {
+- V4L2Buffer *buffer = &ctx->buffers[i];
++ // Orphan any buffers in the wild
++ ff_weak_link_break(&ctx->wl_master);
++
++ if (ctx->bufrefs) {
++ for (i = 0; i < ctx->num_buffers; i++)
++ av_buffer_unref(ctx->bufrefs + i);
++ }
++
++ if (fd != -1) {
++ struct v4l2_requestbuffers req = {
++ .memory = V4L2_MEMORY_MMAP,
++ .type = ctx->type,
++ .count = 0, /* 0 -> unmap all buffers from the driver */
++ };
++
++ while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
++ if (errno == EINTR)
++ continue;
+
+- for (j = 0; j < buffer->num_planes; j++) {
+- struct V4L2Plane_info *p = &buffer->plane_info[j];
+- if (p->mm_addr && p->length)
+- if (munmap(p->mm_addr, p->length) < 0)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
++ ret = AVERROR(errno);
++
++ av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
++ ctx->name, av_err2str(AVERROR(errno)));
++
++ if (ctx_to_m2mctx(ctx)->output_drm)
++ av_log(logger(ctx), AV_LOG_ERROR,
++ "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
++ "for all buffers: \n"
++ " 1. drmModeRmFB(..)\n"
++ " 2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
+ }
+ }
++ atomic_store(&ctx->q_count, 0);
+
+- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
++ return ret;
+ }
+
+ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
+@@ -485,6 +813,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm
+
+ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
+ {
++ V4L2m2mContext* s = ctx_to_m2mctx(ctx);
++ V4L2m2mPriv *priv = s->avctx->priv_data;
+ enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
+ struct v4l2_fmtdesc fdesc;
+ int ret;
+@@ -498,21 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
+ return 0;
+ }
+
+- for (;;) {
++ for (;; ++fdesc.index) {
+ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc);
+ if (ret)
+ return AVERROR(EINVAL);
+
++ if (priv->pix_fmt != AV_PIX_FMT_NONE) {
++ if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt))
++ continue;
++ }
++
+ pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
+ ret = v4l2_try_raw_format(ctx, pixfmt);
+- if (ret){
+- fdesc.index++;
+- continue;
++ if (ret == 0) {
++ *p = pixfmt;
++ return 0;
+ }
+-
+- *p = pixfmt;
+-
+- return 0;
+ }
+
+ return AVERROR(EINVAL);
+@@ -555,30 +886,99 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p)
+ *
+ *****************************************************************************/
+
++
++static void flush_all_buffers_status(V4L2Context* const ctx)
++{
++ int i;
++
++ if (!ctx->bufrefs)
++ return;
++
++ for (i = 0; i < ctx->num_buffers; ++i) {
++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (buf->status == V4L2BUF_IN_DRIVER)
++ ff_v4l2_buffer_set_avail(buf);
++ }
++ atomic_store(&ctx->q_count, 0);
++}
++
++static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
++{
++ int i;
++ int rv;
++
++ if (!ctx->bufrefs) {
++ rv = ff_v4l2_context_init(ctx);
++ if (rv) {
++ av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
++ return rv;
++ }
++ }
++
++ for (i = 0; i < ctx->num_buffers; ++i) {
++ struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
++ if (buf->status == V4L2BUF_AVAILABLE) {
++ rv = ff_v4l2_buffer_enqueue(buf);
++ if (rv < 0)
++ return rv;
++ }
++ }
++ return 0;
++}
++
+ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
+ {
+ int type = ctx->type;
+- int ret;
++ int ret = 0;
++ AVCodecContext * const avctx = logger(ctx);
+
+- ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
+- if (ret < 0)
+- return AVERROR(errno);
++ // Avoid doing anything if there is nothing we can do
++ if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
++ return 0;
+
+- ctx->streamon = (cmd == VIDIOC_STREAMON);
++ ff_mutex_lock(&ctx->lock);
+
+- return 0;
++ if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
++ stuff_all_buffers(avctx, ctx);
++
++ if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
++ const int err = errno;
++ av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
++ ret = AVERROR(err);
++ }
++ else
++ {
++ if (cmd == VIDIOC_STREAMOFF)
++ flush_all_buffers_status(ctx);
++ else
++ ctx->first_buf = 1;
++
++ ctx->streamon = (cmd == VIDIOC_STREAMON);
++ av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
++ cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
++ }
++
++ // Both stream off & on effectively clear flag_last
++ ctx->flag_last = 0;
++
++ ff_mutex_unlock(&ctx->lock);
++
++ return ret;
+ }
+
+ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
++ int64_t track_ts;
+ V4L2Buffer* avbuf;
+ int ret;
+
+ if (!frame) {
+ ret = v4l2_stop_encode(ctx);
+ if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
++ av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
+ s->draining= 1;
+ return 0;
+ }
+@@ -587,23 +987,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
+ if (!avbuf)
+ return AVERROR(EAGAIN);
+
+- ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
++ track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
++
++ ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
+ if (ret)
+ return ret;
+
+ return ff_v4l2_buffer_enqueue(avbuf);
+ }
+
+-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
++ const void * extdata, size_t extlen)
+ {
+ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer* avbuf;
+ int ret;
++ int64_t track_ts;
+
+ if (!pkt->size) {
+ ret = v4l2_stop_decode(ctx);
++ // Log but otherwise ignore stop failure
+ if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
++ av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
+ s->draining = 1;
+ return 0;
+ }
+@@ -612,8 +1018,13 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
+ if (!avbuf)
+ return AVERROR(EAGAIN);
+
+- ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
+- if (ret)
++ track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
++
++ ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
++ if (ret == AVERROR(ENOMEM))
++ av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
++ __func__, pkt->size, avbuf->planes[0].length);
++ else if (ret)
+ return ret;
+
+ return ff_v4l2_buffer_enqueue(avbuf);
+@@ -621,42 +1032,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
+
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
+ {
++ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer *avbuf;
++ int rv;
+
+- /*
+- * timeout=-1 blocks until:
+- * 1. decoded frame available
+- * 2. an input buffer is ready to be dequeued
+- */
+- avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
+- if (!avbuf) {
+- if (ctx->done)
+- return AVERROR_EOF;
+-
+- return AVERROR(EAGAIN);
+- }
++ do {
++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
++ return rv;
++ if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
++ return rv;
++ } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
+
+- return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
++ return 0;
+ }
+
+-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
++int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout)
+ {
++ V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ AVCodecContext *const avctx = s->avctx;
+ V4L2Buffer *avbuf;
++ int rv;
+
+- /*
+- * blocks until:
+- * 1. encoded packet available
+- * 2. an input buffer ready to be dequeued
+- */
+- avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
+- if (!avbuf) {
+- if (ctx->done)
+- return AVERROR_EOF;
++ do {
++ if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
++ return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv; // Caller not currently expecting ENOSPC
++ if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
++ return rv;
++ } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
+
+- return AVERROR(EAGAIN);
+- }
+-
+- return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
++ return 0;
+ }
+
+ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
+@@ -688,78 +1093,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
+
+ int ff_v4l2_context_set_format(V4L2Context* ctx)
+ {
+- return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ int ret;
++
++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ if (ret != 0)
++ return ret;
++
++ // Check returned size against min size and if smaller have another go
++ // Only worry about plane[0] as this is meant to enforce limits for
++ // encoded streams where we might know a bit more about the shape
++ // than the driver
++ if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
++ if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
++ return 0;
++ ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
++ }
++ else {
++ if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
++ return 0;
++ ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
++ }
++
++ ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++ return ret;
+ }
+
+ void ff_v4l2_context_release(V4L2Context* ctx)
+ {
+ int ret;
+
+- if (!ctx->buffers)
++ if (!ctx->bufrefs)
+ return;
+
+ ret = v4l2_release_buffers(ctx);
+ if (ret)
+ av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
+
+- av_freep(&ctx->buffers);
++ av_freep(&ctx->bufrefs);
++ av_buffer_unref(&ctx->frames_ref);
++
++ ff_mutex_destroy(&ctx->lock);
++ pthread_cond_destroy(&ctx->cond);
+ }
+
+-int ff_v4l2_context_init(V4L2Context* ctx)
++
++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
+ {
+- V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ struct v4l2_requestbuffers req;
+- int ret, i;
+-
+- if (!v4l2_type_supported(ctx)) {
+- av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
+- return AVERROR_PATCHWELCOME;
+- }
++ int ret;
++ int i;
+
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
+- if (ret)
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
++ av_assert0(ctx->bufrefs == NULL);
+
+ memset(&req, 0, sizeof(req));
+- req.count = ctx->num_buffers;
+- req.memory = V4L2_MEMORY_MMAP;
++ req.count = req_buffers;
++ req.memory = mem;
+ req.type = ctx->type;
+- ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
+- if (ret < 0) {
+- av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
+- return AVERROR(errno);
++ while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
++ if (errno != EINTR) {
++ ret = AVERROR(errno);
++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
++ return ret;
++ }
+ }
+
+ ctx->num_buffers = req.count;
+- ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
+- if (!ctx->buffers) {
++ ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
++ if (!ctx->bufrefs) {
+ av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
+- return AVERROR(ENOMEM);
++ goto fail_release;
+ }
+
+- for (i = 0; i < req.count; i++) {
+- ctx->buffers[i].context = ctx;
+- ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
+- if (ret < 0) {
++ ctx->wl_master = ff_weak_link_new(ctx);
++ if (!ctx->wl_master) {
++ ret = AVERROR(ENOMEM);
++ goto fail_release;
++ }
++
++ for (i = 0; i < ctx->num_buffers; i++) {
++ ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
++ if (ret) {
+ av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
+- goto error;
++ goto fail_release;
+ }
+ }
+
+ av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
+ req.count,
+- v4l2_get_width(&ctx->format),
+- v4l2_get_height(&ctx->format),
++ ff_v4l2_get_format_width(&ctx->format),
++ ff_v4l2_get_format_height(&ctx->format),
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
+ V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
+
+ return 0;
+
+-error:
++fail_release:
+ v4l2_release_buffers(ctx);
++ av_freep(&ctx->bufrefs);
++ return ret;
++}
++
++int ff_v4l2_context_init(V4L2Context* ctx)
++{
++ struct v4l2_queryctrl qctrl;
++ V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
++ int ret;
++
++ // It is not valid to reinit a context without a previous release
++ av_assert0(ctx->bufrefs == NULL);
++
++ if (!v4l2_type_supported(ctx)) {
++ av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
++ return AVERROR_PATCHWELCOME;
++ }
++
++ ff_mutex_init(&ctx->lock, NULL);
++ pthread_cond_init(&ctx->cond, NULL);
++ atomic_init(&ctx->q_count, 0);
++
++ if (s->output_drm) {
++ AVHWFramesContext *hwframes;
++
++ ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
++ if (!ctx->frames_ref) {
++ ret = AVERROR(ENOMEM);
++ goto fail_unlock;
++ }
+
+- av_freep(&ctx->buffers);
++ hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
++ hwframes->format = AV_PIX_FMT_DRM_PRIME;
++ hwframes->sw_format = ctx->av_pix_fmt;
++ hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
++ hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
++ ret = av_hwframe_ctx_init(ctx->frames_ref);
++ if (ret < 0)
++ goto fail_unref_hwframes;
++ }
++
++ ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
++ if (ret) {
++ ret = AVERROR(errno);
++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
++ goto fail_unref_hwframes;
++ }
++
++ memset(&qctrl, 0, sizeof(qctrl));
++ qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT;
++ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) {
++ ret = AVERROR(errno);
++ if (ret != AVERROR(EINVAL)) {
++ av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERCTRL failed: %s\n", ctx->name, av_err2str(ret));
++ goto fail_unref_hwframes;
++ }
++ // Control unsupported - set default if wanted
++ if (ctx->num_buffers < 2)
++ ctx->num_buffers = 4;
++ }
++ else {
++ if (ctx->num_buffers < 2)
++ ctx->num_buffers = qctrl.minimum + 2;
++ ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum);
++ }
++
++ ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
++ if (ret < 0)
++ goto fail_unref_hwframes;
++
++ return 0;
+
++fail_unref_hwframes:
++ av_buffer_unref(&ctx->frames_ref);
++fail_unlock:
++ ff_mutex_destroy(&ctx->lock);
+ return ret;
+ }
+diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
+index 6f7460c89a..5afed3e6ec 100644
+--- a/libavcodec/v4l2_context.h
++++ b/libavcodec/v4l2_context.h
+@@ -32,6 +32,8 @@
+ #include "libavutil/rational.h"
+ #include "codec_id.h"
+ #include "packet.h"
++#include "libavutil/buffer.h"
++#include "libavutil/thread.h"
+ #include "v4l2_buffers.h"
+
+ typedef struct V4L2Context {
+@@ -71,28 +73,57 @@ typedef struct V4L2Context {
+ */
+ int width, height;
+ AVRational sample_aspect_ratio;
++ struct v4l2_rect selection;
+
+ /**
+- * Indexed array of V4L2Buffers
++ * If the default size of buffer is less than this then try to
++ * set to this.
+ */
+- V4L2Buffer *buffers;
++ uint32_t min_buf_size;
++
++ /**
++ * Indexed array of pointers to V4L2Buffers
++ */
++ AVBufferRef **bufrefs;
+
+ /**
+ * Readonly after init.
+ */
+ int num_buffers;
+
++ /**
++ * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
++ */
++ enum v4l2_memory buf_mem;
++
+ /**
+ * Whether the stream has been started (VIDIOC_STREAMON has been sent).
+ */
+ int streamon;
+
++ /* 1st buffer after stream on */
++ int first_buf;
++
+ /**
+ * Either no more buffers available or an unrecoverable error was notified
+ * by the V4L2 kernel driver: once set the context has to be exited.
+ */
+ int done;
+
++ int flag_last;
++
++ /**
++ * If non-zero then when queueing a frame/pkt use this rather than the
++ * "real" PTS
++ */
++ uint64_t track_ts;
++
++ AVBufferRef *frames_ref;
++ atomic_int q_count;
++ struct ff_weak_link_master *wl_master;
++
++ AVMutex lock;
++ pthread_cond_t cond;
+ } V4L2Context;
+
+ /**
+@@ -148,7 +179,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd);
+ * @param[inout] pkt The AVPacket to dequeue to.
+ * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
+ */
+-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
++int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout);
+
+ /**
+ * Dequeues a buffer from a V4L2Context to an AVFrame.
+@@ -157,7 +188,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
+ * @param[in] ctx The V4L2Context to dequeue from.
+ * @param[inout] f The AVFrame to dequeue to.
+ * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
++ *
+ * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
++ * AVERROR(ENOSPC) if no buffer available to put
++ * the frame in
+ */
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
+
+@@ -171,7 +205,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
+ * @param[in] pkt A pointer to an AVPacket.
+ * @return 0 in case of success, a negative error otherwise.
+ */
+-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
+
+ /**
+ * Enqueues a buffer to a V4L2Context from an AVFrame
+@@ -184,4 +218,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
+ */
+ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
+
++/**
++ * Dequeue all buffers on this queue
++ *
++ * Used to recycle output buffers
++ *
++ * @param[in] ctx The V4L2Context to dequeue from.
++ * @param[in] timeout1 A timeout on dequeuing the 1st buffer,
++ * all others have a timeout of zero
++ * @return if timeout1 is non-zero, the result of the first
++ * dequeue operation (e.g. AVERROR(EAGAIN)); 0 otherwise.
++ */
++int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1);
++
++/**
++ * Returns the number of buffers currently queued
++ *
++ * @param[in] ctx The V4L2Context to evaluate
++ */
++static inline int
++ff_v4l2_context_q_count(const V4L2Context* const ctx)
++{
++ return atomic_load(&ctx->q_count);
++}
++
+ #endif // AVCODEC_V4L2_CONTEXT_H
+diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
+index 602efb7a16..28d9ed4988 100644
+--- a/libavcodec/v4l2_m2m.c
++++ b/libavcodec/v4l2_m2m.c
+@@ -34,6 +34,15 @@
+ #include "v4l2_context.h"
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
++#include "v4l2_req_dmabufs.h"
++
++static void
++xlat_init(xlat_track_t * const x)
++{
++ memset(x, 0, sizeof(*x));
++ x->last_pts = AV_NOPTS_VALUE;
++}
++
+
+ static inline int v4l2_splane_video(struct v4l2_capability *cap)
+ {
+@@ -67,7 +76,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
+
+ s->capture.done = s->output.done = 0;
+ s->capture.name = "capture";
++ s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+ s->output.name = "output";
++ s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+ atomic_init(&s->refcount, 0);
+ sem_init(&s->refsync, 0, 0);
+
+@@ -84,18 +95,58 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
+ if (v4l2_mplane_video(&cap)) {
+ s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+ s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
++ s->output.format.type = s->output.type;
+ return 0;
+ }
+
+ if (v4l2_splane_video(&cap)) {
+ s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+ s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
++ s->output.format.type = s->output.type;
+ return 0;
+ }
+
+ return AVERROR(EINVAL);
+ }
+
++static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++ struct v4l2_format fmt = {.type = s->output.type};
++ int rv;
++ uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt);
++ unsigned int w;
++ unsigned int h;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
++ fmt.fmt.pix_mp.pixelformat = pixfmt;
++ fmt.fmt.pix_mp.width = avctx->width;
++ fmt.fmt.pix_mp.height = avctx->height;
++ }
++ else {
++ fmt.fmt.pix.pixelformat = pixfmt;
++ fmt.fmt.pix.width = avctx->width;
++ fmt.fmt.pix.height = avctx->height;
++ }
++
++ rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt);
++
++ if (rv != 0) {
++ rv = AVERROR(errno);
++ av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv));
++ return rv;
++ }
++
++ w = ff_v4l2_get_format_width(&fmt);
++ h = ff_v4l2_get_format_height(&fmt);
++
++ if (w < avctx->width || h < avctx->height) {
++ av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %dx%d\n", __func__, avctx->width, avctx->height, w, h);
++ return AVERROR(EINVAL);
++ }
++
++ return 0;
++}
++
+ static int v4l2_probe_driver(V4L2m2mContext *s)
+ {
+ void *log_ctx = s->avctx;
+@@ -115,6 +166,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s)
+ goto done;
+ }
+
++ // If being given frames (encode) check that V4L2 can cope with the size
++ if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO &&
++ (ret = check_size(s->avctx, s)) != 0)
++ goto done;
++
+ ret = ff_v4l2_context_get_format(&s->capture, 1);
+ if (ret) {
+ av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n");
+@@ -216,13 +272,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
+ av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
+
+ /* 2. unmap the capture buffers (v4l2 and ffmpeg):
+- * we must wait for all references to be released before being allowed
+- * to queue new buffers.
+ */
+- av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
+- if (atomic_load(&s->refcount))
+- while(sem_wait(&s->refsync) == -1 && errno == EINTR);
+-
+ ff_v4l2_context_release(&s->capture);
+
+ /* 3. get the new capture format */
+@@ -241,7 +291,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
+
+ /* 5. complete reinit */
+ s->draining = 0;
+- s->reinit = 0;
+
+ return 0;
+ }
+@@ -258,6 +307,9 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
+ av_frame_unref(s->frame);
+ av_frame_free(&s->frame);
+ av_packet_unref(&s->buf_pkt);
++ av_freep(&s->extdata_data);
++
++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
+
+ av_free(s);
+ }
+@@ -270,6 +322,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
+ if (!s)
+ return 0;
+
++ av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
++
++ if (s->avctx && av_codec_is_decoder(s->avctx->codec))
++ av_packet_unref(&s->buf_pkt);
++
+ if (s->fd >= 0) {
+ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
+ if (ret)
+@@ -282,7 +339,15 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
+
+ ff_v4l2_context_release(&s->output);
+
++ dmabufs_ctl_unref(&s->db_ctl);
++ close(s->fd);
++ s->fd = -1;
++
+ s->self_ref = NULL;
++ // This is only called on avctx close so after this point we don't have that
++ // Crash sooner if we find we are using it (can still log with avctx = NULL)
++ s->avctx = NULL;
++ priv->context = NULL;
+ av_buffer_unref(&priv->context_ref);
+
+ return 0;
+@@ -326,35 +391,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv)
+ return v4l2_configure_contexts(s);
+ }
+
+-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
+ {
+- *s = av_mallocz(sizeof(V4L2m2mContext));
+- if (!*s)
++ V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
++
++ *pps = NULL;
++ if (!s)
+ return AVERROR(ENOMEM);
+
+- priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
++ priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
+ &v4l2_m2m_destroy_context, NULL, 0);
+ if (!priv->context_ref) {
+- av_freep(s);
++ av_free(s);
+ return AVERROR(ENOMEM);
+ }
+
+ /* assign the context */
+- priv->context = *s;
+- (*s)->priv = priv;
++ priv->context = s;
++ s->priv = priv;
+
+ /* populate it */
+- priv->context->capture.num_buffers = priv->num_capture_buffers;
+- priv->context->output.num_buffers = priv->num_output_buffers;
+- priv->context->self_ref = priv->context_ref;
+- priv->context->fd = -1;
++ s->capture.num_buffers = priv->num_capture_buffers;
++ s->output.num_buffers = priv->num_output_buffers;
++ s->self_ref = priv->context_ref;
++ s->fd = -1;
++ xlat_init(&s->xlat);
+
+ priv->context->frame = av_frame_alloc();
+ if (!priv->context->frame) {
+ av_buffer_unref(&priv->context_ref);
+- *s = NULL; /* freed when unreferencing context_ref */
+ return AVERROR(ENOMEM);
+ }
+
++ *pps = s;
+ return 0;
+ }
+diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
+index 04d86d7b92..a506e69d67 100644
+--- a/libavcodec/v4l2_m2m.h
++++ b/libavcodec/v4l2_m2m.h
+@@ -30,6 +30,7 @@
+ #include <linux/videodev2.h>
+
+ #include "libavcodec/avcodec.h"
++#include "libavutil/pixfmt.h"
+ #include "v4l2_context.h"
+
+ #define container_of(ptr, type, member) ({ \
+@@ -40,6 +41,38 @@
+ { "num_output_buffers", "Number of buffers in the output context",\
+ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
+
++#define FF_V4L2_M2M_TRACK_SIZE 128
++typedef struct V4L2m2mTrackEl {
++ int discard; // If we see this buffer its been flushed, so discard
++ int pending;
++ int pkt_size;
++ int64_t pts;
++ int64_t dts;
++ int64_t reordered_opaque;
++ int64_t pkt_pos;
++ int64_t pkt_duration;
++ int64_t track_pts;
++} V4L2m2mTrackEl;
++
++typedef struct pts_stats_s
++{
++ void * logctx;
++ const char * name; // For debug
++ unsigned int last_count;
++ unsigned int last_interval;
++ int64_t last_pts;
++ int64_t guess;
++} pts_stats_t;
++
++typedef struct xlat_track_s {
++ unsigned int track_no;
++ int64_t last_pts; // Last valid PTS decoded
++ int64_t last_opaque;
++ V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
++} xlat_track_t;
++
++struct dmabufs_ctl;
++
+ typedef struct V4L2m2mContext {
+ char devname[PATH_MAX];
+ int fd;
+@@ -52,10 +85,10 @@ typedef struct V4L2m2mContext {
+ AVCodecContext *avctx;
+ sem_t refsync;
+ atomic_uint refcount;
+- int reinit;
+
+ /* null frame/packet received */
+ int draining;
++ int running;
+ AVPacket buf_pkt;
+
+ /* Reference to a frame. Only used during encoding */
+@@ -66,6 +99,36 @@ typedef struct V4L2m2mContext {
+
+ /* reference back to V4L2m2mPriv */
+ void *priv;
++
++ AVBufferRef *device_ref;
++
++ /* generate DRM frames */
++ int output_drm;
++
++ /* input frames are drmprime */
++ int input_drm;
++
++ /* Frame tracking */
++ xlat_track_t xlat;
++
++ pts_stats_t pts_stat;
++
++ /* req pkt */
++ int req_pkt;
++ int reorder_size;
++
++ /* Ext data sent */
++ int extdata_sent;
++ /* Ext data sent in packet - overrides ctx */
++ void * extdata_data;
++ size_t extdata_size;
++
++#define FF_V4L2_QUIRK_REINIT_ALWAYS 1
++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN 2
++ /* Quirks */
++ unsigned int quirks;
++
++ struct dmabufs_ctl * db_ctl;
+ } V4L2m2mContext;
+
+ typedef struct V4L2m2mPriv {
+@@ -76,6 +139,8 @@ typedef struct V4L2m2mPriv {
+
+ int num_output_buffers;
+ int num_capture_buffers;
++ const char * dmabuf_alloc;
++ enum AVPixelFormat pix_fmt;
+ } V4L2m2mPriv;
+
+ /**
+@@ -129,4 +194,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx);
+ */
+ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
+
++
++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++}
++
++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++}
++
++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
++}
++
++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
++{
++ return ctx->flag_last;
++}
++
++
+ #endif /* AVCODEC_V4L2_M2M_H */
+diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
+index 4944d08511..11c83b2d66 100644
+--- a/libavcodec/v4l2_m2m_dec.c
++++ b/libavcodec/v4l2_m2m_dec.c
+@@ -21,8 +21,14 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include "config_components.h"
++
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
++
++#include "libavutil/avassert.h"
++#include "libavutil/hwcontext.h"
++#include "libavutil/hwcontext_drm.h"
+ #include "libavutil/pixfmt.h"
+ #include "libavutil/pixdesc.h"
+ #include "libavutil/opt.h"
+@@ -30,75 +36,279 @@
+ #include "codec_internal.h"
+ #include "libavcodec/decode.h"
+
++#include "libavcodec/hwaccels.h"
++#include "libavcodec/internal.h"
++#include "libavcodec/hwconfig.h"
++
+ #include "v4l2_context.h"
+ #include "v4l2_m2m.h"
+ #include "v4l2_fmt.h"
++#include "v4l2_req_dmabufs.h"
+
+-static int v4l2_try_start(AVCodecContext *avctx)
++#if CONFIG_H264_DECODER
++#include "h264_parse.h"
++#endif
++#if CONFIG_HEVC_DECODER
++#include "hevc_parse.h"
++#endif
++
++// Pick 64 for max last count - that is >1sec at 60fps
++#define STATS_LAST_COUNT_MAX 64
++#define STATS_INTERVAL_MAX (1 << 30)
++
++#ifndef FF_API_BUFFER_SIZE_T
++#define FF_API_BUFFER_SIZE_T 1
++#endif
++
++#define DUMP_FAILED_EXTRADATA 0
++
++#if DUMP_FAILED_EXTRADATA
++static inline char hex1(unsigned int x)
+ {
+- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+- V4L2Context *const capture = &s->capture;
+- V4L2Context *const output = &s->output;
+- struct v4l2_selection selection = { 0 };
+- int ret;
++ x &= 0xf;
++ return x <= 9 ? '0' + x : 'a' + x - 10;
++}
+
+- /* 1. start the output process */
+- if (!output->streamon) {
+- ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
+- if (ret < 0) {
+- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
+- return ret;
+- }
++static inline char * hex2(char * s, unsigned int x)
++{
++ *s++ = hex1(x >> 4);
++ *s++ = hex1(x);
++ return s;
++}
++
++static inline char * hex4(char * s, unsigned int x)
++{
++ s = hex2(s, x >> 8);
++ s = hex2(s, x);
++ return s;
++}
++
++static inline char * dash2(char * s)
++{
++ *s++ = '-';
++ *s++ = '-';
++ return s;
++}
++
++static void
++data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len)
++{
++ size_t i;
++ s = hex4(s, offset);
++ m += offset;
++ for (i = 0; i != 8; ++i) {
++ *s++ = ' ';
++ s = len > i + offset ? hex2(s, *m++) : dash2(s);
+ }
++ *s++ = ' ';
++ *s++ = ':';
++ for (; i != 16; ++i) {
++ *s++ = ' ';
++ s = len > i + offset ? hex2(s, *m++) : dash2(s);
++ }
++ *s++ = 0;
++}
+
+- if (capture->streamon)
+- return 0;
++static void
++log_dump(void * logctx, int lvl, const void * const data, const size_t len)
++{
++ size_t i;
++ for (i = 0; i < len; i += 16) {
++ char buf[80];
++ data16(buf, i, data, len);
++ av_log(logctx, lvl, "%s\n", buf);
++ }
++}
++#endif
+
+- /* 2. get the capture format */
+- capture->format.type = capture->type;
+- ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
+- if (ret) {
+- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
+- return ret;
++static unsigned int pts_stats_interval(const pts_stats_t * const stats)
++{
++ return stats->last_interval;
++}
++
++static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess)
++{
++ if (stats->last_count <= 1)
++ return stats->last_pts;
++ if (stats->last_pts == AV_NOPTS_VALUE ||
++ fail_bad_guess && (stats->last_interval == 0 ||
++ stats->last_count >= STATS_LAST_COUNT_MAX))
++ return AV_NOPTS_VALUE;
++ return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
++}
++
++static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
++{
++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
++ if (stats->last_count < STATS_LAST_COUNT_MAX)
++ ++stats->last_count;
++ return;
+ }
+
+- /* 2.1 update the AVCodecContext */
+- avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
+- capture->av_pix_fmt = avctx->pix_fmt;
++ if (stats->last_pts != AV_NOPTS_VALUE) {
++ const int64_t interval = pts - stats->last_pts;
+
+- /* 3. set the crop parameters */
+- selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+- selection.r.height = avctx->coded_height;
+- selection.r.width = avctx->coded_width;
+- ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
+- if (!ret) {
+- ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
+- if (ret) {
+- av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
+- } else {
+- av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
+- /* update the size of the resulting frame */
+- capture->height = selection.r.height;
+- capture->width = selection.r.width;
++ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
++ stats->last_count >= STATS_LAST_COUNT_MAX) {
++ if (stats->last_interval != 0)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
++ __func__, stats->name, interval, stats->last_count);
++ stats->last_interval = 0;
++ }
++ else {
++ const int64_t frame_time = interval / (int64_t)stats->last_count;
++
++ if (frame_time != stats->last_interval)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
++ stats->last_interval = frame_time;
+ }
+ }
+
+- /* 4. init the capture context now that we have the capture format */
+- if (!capture->buffers) {
+- ret = ff_v4l2_context_init(capture);
+- if (ret) {
+- av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
+- return AVERROR(ENOMEM);
++ stats->last_pts = pts;
++ stats->last_count = 1;
++}
++
++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
++{
++ *stats = (pts_stats_t){
++ .logctx = logctx,
++ .name = name,
++ .last_count = 1,
++ .last_interval = 0,
++ .last_pts = AV_NOPTS_VALUE
++ };
++}
++
++// If abdata == NULL then this just counts space required
++// Unpacks avcC if detected
++static int
++h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata)
++{
++ const uint8_t * const xdend = extradata + extrasize;
++ const uint8_t * p = extradata;
++ uint8_t * d = abdata;
++ unsigned int n;
++ unsigned int len;
++ const unsigned int hdrlen = 4;
++ unsigned int need_pps = 1;
++
++ if (extrasize < 8)
++ return AVERROR(EINVAL);
++
++ if (p[0] == 0 && p[1] == 0) {
++ // Assume a couple of leading zeros are good enough to indicate NAL
++ if (abdata)
++ memcpy(d, p, extrasize);
++ return extrasize;
++ }
++
++ // avcC starts with a 1
++ if (p[0] != 1)
++ return AVERROR(EINVAL);
++
++ p += 5;
++ n = *p++ & 0x1f;
++
++doxps:
++ while (n--) {
++ if (xdend - p < 2)
++ return AVERROR(EINVAL);
++ len = (p[0] << 8) | p[1];
++ p += 2;
++ if (xdend - p < (ptrdiff_t)len)
++ return AVERROR(EINVAL);
++ if (abdata) {
++ d[0] = 0;
++ d[1] = 0;
++ d[2] = 0;
++ d[3] = 1;
++ memcpy(d + 4, p, len);
+ }
++ d += len + hdrlen;
++ p += len;
++ }
++ if (need_pps) {
++ need_pps = 0;
++ if (p >= xdend)
++ return AVERROR(EINVAL);
++ n = *p++;
++ goto doxps;
+ }
+
+- /* 5. start the capture process */
+- ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+- if (ret) {
+- av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
++ return d - abdata;
++}
++
++static int
++copy_extradata(AVCodecContext * const avctx,
++ const void * const src_data, const int src_len,
++ void ** const pdst_data, size_t * const pdst_len)
++{
++ int len;
++
++ *pdst_len = 0;
++ av_freep(pdst_data);
++
++ if (avctx->codec_id == AV_CODEC_ID_H264)
++ len = h264_xd_copy(src_data, src_len, NULL);
++ else
++ len = src_len < 0 ? AVERROR(EINVAL) : src_len;
++
++ // Zero length is OK but we want to stop - -ve is error val
++ if (len <= 0)
++ return len;
++
++ if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
++ return AVERROR(ENOMEM);
++
++ if (avctx->codec_id == AV_CODEC_ID_H264)
++ h264_xd_copy(src_data, src_len, *pdst_data);
++ else
++ memcpy(*pdst_data, src_data, len);
++ *pdst_len = len;
++
++ return 0;
++}
++
++
++
++static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
++{
++ int ret;
++ struct v4l2_decoder_cmd cmd = {
++ .cmd = V4L2_DEC_CMD_START,
++ .flags = 0,
++ };
++
++ if (s->output.streamon)
++ return 0;
++
++ ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
++ if (ret != 0) {
++ av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
+ return ret;
+ }
+
++ // STREAMON should do implicit START so this just for those that don't.
++ // It is optional so don't worry if it fails
++ if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
++ ret = AVERROR(errno);
++ av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
++ }
++ else {
++ av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
++ }
++ return 0;
++}
++
++static int v4l2_try_start(AVCodecContext *avctx)
++{
++ V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++ int ret;
++
++ /* 1. start the output process */
++ if ((ret = check_output_streamon(avctx, s)) != 0)
++ return ret;
+ return 0;
+ }
+
+@@ -133,62 +343,760 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s)
+ return 0;
+ }
+
+-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++static void
++set_best_effort_pts(AVCodecContext *const avctx,
++ pts_stats_t * const ps,
++ AVFrame *const frame)
++{
++ pts_stats_add(ps, frame->pts);
++
++ frame->best_effort_timestamp = pts_stats_guess(ps, 1);
++ // If we can't guess from just PTS - try DTS
++ if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
++ frame->best_effort_timestamp = frame->pkt_dts;
++
++ // We can't emulate what s/w does in a useful manner and using the
++ // "correct" answer seems to just confuse things.
++ frame->pkt_dts = frame->pts;
++ av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
++ frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
++}
++
++static void
++xlat_flush(xlat_track_t * const x)
++{
++ unsigned int i;
++ // Do not reset track_no - this ensures that any frames left in the decoder
++ // that turn up later get discarded.
++
++ x->last_pts = AV_NOPTS_VALUE;
++ x->last_opaque = 0;
++ for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
++ x->track_els[i].pending = 0;
++ x->track_els[i].discard = 1;
++ }
++}
++
++static void
++xlat_init(xlat_track_t * const x)
++{
++ memset(x, 0, sizeof(*x));
++ xlat_flush(x);
++}
++
++static int
++xlat_pending(const V4L2m2mContext * const s)
++{
++ const xlat_track_t *const x = &s->xlat;
++ unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
++ int i;
++ const int64_t now = pts_stats_guess(&s->pts_stat, 0);
++ int64_t first_dts = AV_NOPTS_VALUE;
++ int no_dts_count = 0;
++ unsigned int interval = pts_stats_interval(&s->pts_stat);
++
++ for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) {
++ const V4L2m2mTrackEl * const t = x->track_els + n;
++
++ if (first_dts == AV_NOPTS_VALUE)
++ if (t->dts == AV_NOPTS_VALUE)
++ ++no_dts_count;
++ else
++ first_dts = t->dts;
++
++ // Discard only set on never-set or flushed entries
++ // So if we get here we've never successfully decoded a frame so allow
++ // more frames into the buffer before stalling
++ if (t->discard)
++ return i - 16;
++
++ // If we've got this frame out then everything before this point
++ // must have entered the decoder
++ if (!t->pending)
++ break;
++
++ // If we've never seen a pts all we can do is count frames
++ if (now == AV_NOPTS_VALUE)
++ continue;
++
++ if (t->dts != AV_NOPTS_VALUE && now >= t->dts)
++ break;
++ }
++
++ if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) {
++ const int iframes = (first_dts - now) / (int)interval;
++ const int t = iframes - s->reorder_size + no_dts_count;
++
++// av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n",
++// x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count);
++
++ if (iframes > 0 && iframes < 64 && t < i) {
++ return t;
++ }
++ }
++
++ return i;
++}
++
++static inline int stream_started(const V4L2m2mContext * const s) {
++ return s->output.streamon;
++}
++
++#define NQ_OK 0
++#define NQ_Q_FULL 1
++#define NQ_SRC_EMPTY 2
++#define NQ_NONE 3
++#define NQ_DRAINING 4
++#define NQ_DEAD 5
++
++#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
++
++// do_not_get If true then no new packet will be got but status will
++// be set appropriately
++
++// AVERROR_EOF Flushing an already flushed stream
++// -ve Error (all errors except EOF are unexpected)
++// NQ_OK (0) OK
++// NQ_Q_FULL Dst full (retry if we think V4L2 Q has space now)
++// NQ_SRC_EMPTY Src empty (do not retry)
++// NQ_NONE Enqueue not attempted
++// NQ_DRAINING At EOS, dQ dest until EOS there too
++// NQ_DEAD Not running (do not retry, do not attempt capture dQ)
++
++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
+ {
+- V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+- V4L2Context *const capture = &s->capture;
+- V4L2Context *const output = &s->output;
+ int ret;
+
+- if (!s->buf_pkt.size) {
+- ret = ff_decode_get_packet(avctx, &s->buf_pkt);
++ // If we don't already have a coded packet - get a new one
++ // We will already have a coded pkt if the output Q was full last time we
++ // tried to Q it
++ if (!s->buf_pkt.size && !do_not_get) {
++ unsigned int i;
++
++ for (i = 0; i < 256; ++i) {
++ uint8_t * side_data;
++ size_t side_size;
++
++ ret = ff_decode_get_packet(avctx, &s->buf_pkt);
++ if (ret != 0)
++ break;
++
++            // New extradata is the only side-data we understand
++ side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
++ if (side_data) {
++ av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
++ if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0)
++ av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret));
++ s->extdata_sent = 0;
++ }
++
++ if (s->buf_pkt.size != 0)
++ break;
++
++ if (s->buf_pkt.side_data_elems == 0) {
++ av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
++ ret = AVERROR_EOF;
++ break;
++ }
++
++ // Retry a side-data only pkt
++ }
++ // If i >= 256 something has gone wrong
++ if (i >= 256) {
++ av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
++ return AVERROR(EIO);
++ }
++
++ if (ret == AVERROR(EAGAIN)) {
++ if (!stream_started(s)) {
++ av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
++ return NQ_DEAD;
++ }
++ return NQ_SRC_EMPTY;
++ }
++
++ if (ret == AVERROR_EOF) {
++ // EOF - enter drain mode
++ av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
++ ret, s->buf_pkt.size, stream_started(s), s->draining);
++ if (!stream_started(s)) {
++ av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
++ s->draining = 1;
++ s->capture.done = 1;
++ return AVERROR_EOF;
++ }
++
++ if (!s->draining) {
++ // Calling enqueue with an empty pkt starts drain
++ av_assert0(s->buf_pkt.size == 0);
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++ if (ret) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
++ return ret;
++ }
++ }
++ return NQ_DRAINING;
++ }
++
+ if (ret < 0) {
+- if (ret == AVERROR(EAGAIN))
+- return ff_v4l2_context_dequeue_frame(capture, frame, 0);
+- else if (ret != AVERROR_EOF)
+- return ret;
++ av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
++ return ret;
+ }
+ }
+
+- if (s->draining)
+- goto dequeue;
++ if (s->draining) {
++ if (s->buf_pkt.size) {
++ av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
++ av_packet_unref(&s->buf_pkt);
++ }
++ return NQ_DRAINING;
++ }
++
++ if (!s->buf_pkt.size)
++ return NQ_NONE;
+
+- ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt);
+- if (ret < 0 && ret != AVERROR(EAGAIN))
+- goto fail;
++ if ((ret = check_output_streamon(avctx, s)) != 0)
++ return ret;
+
+- /* if EAGAIN don't unref packet and try to enqueue in the next iteration */
+- if (ret != AVERROR(EAGAIN))
++ if (s->extdata_sent)
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++ else
++ ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
++
++ if (ret == AVERROR(EAGAIN)) {
++ // Out of input buffers - keep packet
++ ret = NQ_Q_FULL;
++ }
++ else {
++ // In all other cases we are done with this packet
+ av_packet_unref(&s->buf_pkt);
++ s->extdata_sent = 1;
+
+- if (!s->draining) {
+- ret = v4l2_try_start(avctx);
+ if (ret) {
+- /* cant recover */
+- if (ret != AVERROR(ENOMEM))
+- ret = 0;
+- goto fail;
++ av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
++ return ret;
+ }
+ }
+
+-dequeue:
+- return ff_v4l2_context_dequeue_frame(capture, frame, -1);
+-fail:
+- av_packet_unref(&s->buf_pkt);
++ // Start if we haven't
++ {
++ const int ret2 = v4l2_try_start(avctx);
++ if (ret2) {
++ av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
++ ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
++ }
++ }
++
++ return ret;
++}
++
++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
++{
++ int rv = 0;
++
++ ff_mutex_lock(&ctx->lock);
++
++ while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {
++ if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) {
++ rv = AVERROR(errno);
++ av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
++ break;
++ }
++ }
++
++ ff_mutex_unlock(&ctx->lock);
++ return rv;
++}
++
++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++{
++ V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++ int src_rv = -1;
++ int dst_rv = 1; // Non-zero (done), non-negative (error) number
++ unsigned int i = 0;
++
++ do {
++ const int pending = xlat_pending(s);
++ const int prefer_dq = (pending > 4);
++ const int last_src_rv = src_rv;
++
++ av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt);
++
++ // Enqueue another pkt for decode if
++ // (a) We don't have a lot of stuff in the buffer already OR
++ // (b) ... we (think we) do but we've failed to get a frame already OR
++ // (c) We've dequeued a lot of frames without asking for input
++ src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
++
++ // If we got a frame last time or we've already tried to get a frame and
++ // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
++ // indicating that we want more input.
++ // This should mean that once decode starts we enter a stable state where
++ // we alternately ask for input and produce output
++ if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
++ break;
++
++ if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
++ av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
++ break;
++ }
++
++ // Try to get a new frame if
++ // (a) we haven't already got one AND
++ // (b) enqueue returned a status indicating that decode should be attempted
++ if (dst_rv != 0 && TRY_DQ(src_rv)) {
++ // Pick a timeout depending on state
++ // The pending count isn't completely reliable so it is good enough
++ // hint that we want a frame but not good enough to require it in
++ // all cases; however if it has got > 31 that exceeds its margin of
++ // error so require a frame to prevent ridiculous levels of latency
++ const int t =
++ src_rv == NQ_Q_FULL ? -1 :
++ src_rv == NQ_DRAINING ? 300 :
++ prefer_dq ? (s->running && pending > 31 ? 100 : 5) : 0;
++
++ // Dequeue frame will unref any previous contents of frame
++ // if it returns success so we don't need an explicit unref
++ // when discarding
++ // This returns AVERROR(EAGAIN) on timeout or if
++ // there is room in the input Q and timeout == -1
++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++
++ // Failure due to no buffer in Q?
++ if (dst_rv == AVERROR(ENOSPC)) {
++ // Wait & retry
++ if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
++ dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++ }
++ }
++
++ if (dst_rv == 0) {
++ set_best_effort_pts(avctx, &s->pts_stat, frame);
++ if (!s->running) {
++ s->running = 1;
++ av_log(avctx, AV_LOG_VERBOSE, "Decode running\n");
++ }
++ }
++
++ if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
++ av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
++ dst_rv = AVERROR_EOF;
++ s->capture.done = 1;
++ }
++ else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
++ av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
++ s->draining, s->capture.done);
++ else if (dst_rv && dst_rv != AVERROR(EAGAIN))
++ av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
++ s->draining, s->capture.done, dst_rv);
++ }
++
++ ++i;
++ if (i >= 256) {
++ av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i);
++ src_rv = AVERROR(EIO);
++ }
++
++ // Continue trying to enqueue packets if either
++ // (a) we succeeded last time OR
++ // (b) we didn't ret a frame and we can retry the input
++ } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
++
++ // Ensure that the frame contains nothing if we aren't returning a frame
++ // (might happen when discarding)
++ if (dst_rv)
++ av_frame_unref(frame);
++
++ // If we got a frame this time ask for a pkt next time
++ s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
++
++#if 0
++ if (dst_rv == 0)
++ {
++ static int z = 0;
++ if (++z > 50) {
++ av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
++ ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++ return -1;
++ }
++ }
++#endif
++
++ return dst_rv == 0 ? 0 :
++ src_rv < 0 ? src_rv :
++ dst_rv < 0 ? dst_rv :
++ AVERROR(EAGAIN);
++}
++
++#if 0
++#include <time.h>
++static int64_t us_time(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
++}
++
++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++{
++ int ret;
++ const int64_t now = us_time();
++ int64_t done;
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++ ret = v4l2_receive_frame2(avctx, frame);
++ done = us_time();
++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
+ return ret;
+ }
++#endif
++
++static uint32_t
++avprofile_to_v4l2(const enum AVCodecID codec_id, const int avprofile)
++{
++ switch (codec_id) {
++ case AV_CODEC_ID_H264:
++ switch (avprofile) {
++ case FF_PROFILE_H264_BASELINE:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE;
++ case FF_PROFILE_H264_CONSTRAINED_BASELINE:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_BASELINE;
++ case FF_PROFILE_H264_MAIN:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_MAIN;
++ case FF_PROFILE_H264_EXTENDED:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_EXTENDED;
++ case FF_PROFILE_H264_HIGH:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH;
++ case FF_PROFILE_H264_HIGH_10:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10;
++ case FF_PROFILE_H264_HIGH_10_INTRA:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10_INTRA;
++ case FF_PROFILE_H264_MULTIVIEW_HIGH:
++ case FF_PROFILE_H264_HIGH_422:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422;
++ case FF_PROFILE_H264_HIGH_422_INTRA:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422_INTRA;
++ case FF_PROFILE_H264_STEREO_HIGH:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH;
++ case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_PREDICTIVE;
++ case FF_PROFILE_H264_HIGH_444_INTRA:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_INTRA;
++ case FF_PROFILE_H264_CAVLC_444:
++ return V4L2_MPEG_VIDEO_H264_PROFILE_CAVLC_444_INTRA;
++ case FF_PROFILE_H264_HIGH_444:
++ default:
++ break;
++// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_BASELINE = 12,
++// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH = 13,
++// V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA = 14,
++// V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH = 16,
++// V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH = 17,
++ }
++ break;
++ case AV_CODEC_ID_MPEG2VIDEO:
++ case AV_CODEC_ID_MPEG4:
++ case AV_CODEC_ID_VC1:
++ case AV_CODEC_ID_VP8:
++ case AV_CODEC_ID_VP9:
++ case AV_CODEC_ID_AV1:
++ // Most profiles are a simple number that matches the V4L2 enum
++ return avprofile;
++ default:
++ break;
++ }
++ return ~(uint32_t)0;
++}
++
++// This check mirrors Chrome's profile check by testing to see if the profile
++// exists as a possible value for the V4L2 profile control
++static int
++check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s)
++{
++ struct v4l2_queryctrl query_ctrl;
++ struct v4l2_querymenu query_menu;
++ uint32_t profile_id;
++
++ // An unset profile is almost certainly zero or -99 - do not reject
++ if (avctx->profile <= 0) {
++ av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile);
++ return 0;
++ }
++
++ memset(&query_ctrl, 0, sizeof(query_ctrl));
++ switch (avctx->codec_id) {
++ case AV_CODEC_ID_MPEG2VIDEO:
++ profile_id = V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE;
++ break;
++ case AV_CODEC_ID_MPEG4:
++ profile_id = V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE;
++ break;
++ case AV_CODEC_ID_H264:
++ profile_id = V4L2_CID_MPEG_VIDEO_H264_PROFILE;
++ break;
++ case AV_CODEC_ID_VP8:
++ profile_id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE;
++ break;
++ case AV_CODEC_ID_VP9:
++ profile_id = V4L2_CID_MPEG_VIDEO_VP9_PROFILE;
++ break;
++#ifdef V4L2_CID_MPEG_VIDEO_AV1_PROFILE
++ case AV_CODEC_ID_AV1:
++ profile_id = V4L2_CID_MPEG_VIDEO_AV1_PROFILE;
++ break;
++#endif
++ default:
++ av_log(avctx, AV_LOG_VERBOSE, "Can't map profile for codec id %d; profile check skipped\n", avctx->codec_id);
++ return 0;
++ }
++
++ query_ctrl = (struct v4l2_queryctrl){.id = profile_id};
++ if (ioctl(s->fd, VIDIOC_QUERYCTRL, &query_ctrl) != 0) {
++ av_log(avctx, AV_LOG_VERBOSE, "Query profile ctrl (%#x) not supported: assume OK\n", query_ctrl.id);
++ }
++ else {
++ av_log(avctx, AV_LOG_DEBUG, "%s: Control supported: %#x\n", __func__, query_ctrl.id);
++
++ query_menu = (struct v4l2_querymenu){
++ .id = query_ctrl.id,
++ .index = avprofile_to_v4l2(avctx->codec_id, avctx->profile),
++ };
++
++ if (query_menu.index > query_ctrl.maximum ||
++ query_menu.index < query_ctrl.minimum ||
++ ioctl(s->fd, VIDIOC_QUERYMENU, &query_menu) != 0) {
++ return AVERROR(ENOENT);
++ }
++ }
++
++ return 0;
++};
++
++static int
++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++ unsigned int i;
++ const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
++ const uint32_t w = avctx->coded_width;
++ const uint32_t h = avctx->coded_height;
++
++ if (w == 0 || h == 0 || fcc == 0) {
++ av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
++ return 0;
++ }
++ if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
++ av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
++ return 0;
++ }
++
++ for (i = 0;; ++i) {
++ struct v4l2_frmsizeenum fs = {
++ .index = i,
++ .pixel_format = fcc,
++ };
++
++ while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
++ const int err = AVERROR(errno);
++ if (err == AVERROR(EINTR))
++ continue;
++ if (i == 0 && err == AVERROR(ENOTTY)) {
++ av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
++ return 0;
++ }
++ if (err != AVERROR(EINVAL)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err));
++ return err;
++ }
++ av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
++ w, h, av_fourcc2str(fcc), i);
++ return err;
++ }
++
++ switch (fs.type) {
++ case V4L2_FRMSIZE_TYPE_DISCRETE:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
++ fs.discrete.width,fs.discrete.height);
++ if (w == fs.discrete.width && h == fs.discrete.height)
++ return 0;
++ break;
++ case V4L2_FRMSIZE_TYPE_STEPWISE:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++ fs.stepwise.min_width, fs.stepwise.min_height,
++ fs.stepwise.max_width, fs.stepwise.max_height,
++ fs.stepwise.step_width,fs.stepwise.step_height);
++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
++ (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 &&
++ (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0)
++ return 0;
++ break;
++ case V4L2_FRMSIZE_TYPE_CONTINUOUS:
++ av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++ fs.stepwise.min_width, fs.stepwise.min_height,
++ fs.stepwise.max_width, fs.stepwise.max_height,
++ fs.stepwise.step_width,fs.stepwise.step_height);
++ if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++ h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
++ return 0;
++ break;
++ default:
++ av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type);
++ return AVERROR(EINVAL);
++ }
++ }
++}
++
++static int
++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++ struct v4l2_capability cap;
++
++ memset(&cap, 0, sizeof(cap));
++ while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
++ int err = errno;
++ if (err == EINTR)
++ continue;
++ av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
++ return AVERROR(err);
++ }
++
++ // Could be made table driven if we have a few more but right now there
++ // seems no point
++
++ // Meson (amlogic) always gives a resolution changed event after output
++ // streamon and userspace must (re)allocate capture buffers and streamon
++ // capture to clear the event even if the capture buffers were the right
++ // size in the first place.
++ if (strcmp(cap.driver, "meson-vdec") == 0)
++ s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
++
++ av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
++ return 0;
++}
++
++// This heuristic is for H264 but use for everything
++static uint32_t max_coded_size(const AVCodecContext * const avctx)
++{
++ uint32_t wxh = avctx->coded_width * avctx->coded_height;
++ uint32_t size;
++
++ size = wxh * 3 / 2;
++ // H.264 Annex A table A-1 gives minCR which is either 2 or 4
++ // unfortunately that doesn't yield an actually useful limit
++ // and it should be noted that frame 0 is special cased to allow
++ // a bigger number which really isn't helpful for us. So just pick
++ // frame_size / 2
++ size /= 2;
++ // Add 64k to allow for any overheads and/or encoder hopefulness
++ // with small WxH
++ return size + (1 << 16);
++}
++
++static void
++parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++ s->reorder_size = 0;
++
++ if (!avctx->extradata || !avctx->extradata_size)
++ return;
++
++ switch (avctx->codec_id) {
++#if CONFIG_H264_DECODER
++ case AV_CODEC_ID_H264:
++ {
++ H264ParamSets ps = {{NULL}};
++ int is_avc = 0;
++ int nal_length_size = 0;
++ int ret;
++
++ ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
++ &ps, &is_avc, &nal_length_size,
++ avctx->err_recognition, avctx);
++ if (ret > 0) {
++ const SPS * sps = NULL;
++ unsigned int i;
++ for (i = 0; i != MAX_SPS_COUNT; ++i) {
++ if (ps.sps_list[i]) {
++ sps = (const SPS *)ps.sps_list[i]->data;
++ break;
++ }
++ }
++ if (sps) {
++ avctx->profile = ff_h264_get_profile(sps);
++ avctx->level = sps->level_idc;
++ s->reorder_size = sps->num_reorder_frames;
++ }
++ }
++ ff_h264_ps_uninit(&ps);
++ break;
++ }
++#endif
++#if CONFIG_HEVC_DECODER
++ case AV_CODEC_ID_HEVC:
++ {
++ HEVCParamSets ps = {{NULL}};
++ HEVCSEI sei = {{{{0}}}};
++ int is_nalff = 0;
++ int nal_length_size = 0;
++ int ret;
++
++ ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size,
++ &ps, &sei, &is_nalff, &nal_length_size,
++ avctx->err_recognition, 0, avctx);
++ if (ret > 0) {
++ const HEVCSPS * sps = NULL;
++ unsigned int i;
++ for (i = 0; i != HEVC_MAX_SPS_COUNT; ++i) {
++ if (ps.sps_list[i]) {
++ sps = (const HEVCSPS *)ps.sps_list[i]->data;
++ break;
++ }
++ }
++ if (sps) {
++ avctx->profile = sps->ptl.general_ptl.profile_idc;
++ avctx->level = sps->ptl.general_ptl.level_idc;
++ s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering;
++ }
++ }
++ ff_hevc_ps_uninit(&ps);
++ ff_hevc_reset_sei(&sei);
++ break;
++ }
++#endif
++ default:
++ break;
++ }
++}
+
+ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+ {
+ V4L2Context *capture, *output;
+ V4L2m2mContext *s;
+ V4L2m2mPriv *priv = avctx->priv_data;
++ int gf_pix_fmt;
+ int ret;
+
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++ if (avctx->codec_id == AV_CODEC_ID_H264) {
++ if (avctx->ticks_per_frame == 1) {
++ if(avctx->time_base.den < INT_MAX/2) {
++ avctx->time_base.den *= 2;
++ } else
++ avctx->time_base.num /= 2;
++ }
++ avctx->ticks_per_frame = 2;
++ }
++
+ ret = ff_v4l2_m2m_create_context(priv, &s);
+ if (ret < 0)
+ return ret;
+
++ parse_extradata(avctx, s);
++
++ xlat_init(&s->xlat);
++ pts_stats_init(&s->pts_stat, avctx, "decoder");
++
+ capture = &s->capture;
+ output = &s->output;
+
+@@ -196,14 +1104,65 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+ * by the v4l2 driver; this event will trigger a full pipeline reconfig and
+ * the proper values will be retrieved from the kernel driver.
+ */
+- output->height = capture->height = avctx->coded_height;
+- output->width = capture->width = avctx->coded_width;
++// output->height = capture->height = avctx->coded_height;
++// output->width = capture->width = avctx->coded_width;
++ output->height = capture->height = 0;
++ output->width = capture->width = 0;
+
+ output->av_codec_id = avctx->codec_id;
+ output->av_pix_fmt = AV_PIX_FMT_NONE;
++ output->min_buf_size = max_coded_size(avctx);
+
+ capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+ capture->av_pix_fmt = avctx->pix_fmt;
++ capture->min_buf_size = 0;
++
++ /* the client requests the codec to generate DRM frames:
++ * - data[0] will therefore point to the returned AVDRMFrameDescriptor
++ * check the ff_v4l2_buffer_to_avframe conversion function.
++ * - the DRM frame format is passed in the DRM frame descriptor layer.
++ * check the v4l2_get_drm_frame function.
++ */
++
++ avctx->sw_pix_fmt = avctx->pix_fmt;
++ gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
++ av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
++ avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
++ avctx->coded_width, avctx->coded_height,
++ gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
++
++ if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
++ avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++ s->output_drm = 1;
++ }
++ else {
++ capture->av_pix_fmt = gf_pix_fmt;
++ s->output_drm = 0;
++ }
++
++ s->db_ctl = NULL;
++ if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) {
++ if (strcmp(priv->dmabuf_alloc, "cma") == 0)
++ s->db_ctl = dmabufs_ctl_new();
++ else {
++ av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc);
++ return AVERROR(EINVAL);
++ }
++ if (!s->db_ctl) {
++ av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc);
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
++ if (!s->device_ref) {
++ ret = AVERROR(ENOMEM);
++ return ret;
++ }
++
++ ret = av_hwdevice_ctx_init(s->device_ref);
++ if (ret < 0)
++ return ret;
+
+ s->avctx = avctx;
+ ret = ff_v4l2_m2m_codec_init(priv);
+@@ -212,12 +1171,88 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+ return ret;
+ }
+
+- return v4l2_prepare_decoder(s);
++ if (avctx->extradata &&
++ (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret));
++#if DUMP_FAILED_EXTRADATA
++ log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size);
++#endif
++ return ret;
++ }
++
++ if ((ret = v4l2_prepare_decoder(s)) < 0)
++ return ret;
++
++ if ((ret = get_quirks(avctx, s)) != 0)
++ return ret;
++
++ if ((ret = check_size(avctx, s)) != 0)
++ return ret;
++
++ if ((ret = check_profile(avctx, s)) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile);
++ return ret;
++ }
++ return 0;
+ }
+
+ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
+ {
+- return ff_v4l2_m2m_codec_end(avctx->priv_data);
++ int rv;
++ av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++ rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
++ av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
++ return rv;
++}
++
++static void v4l2_decode_flush(AVCodecContext *avctx)
++{
++    // An alternative and more drastic form of flush is to simply do this:
++ // v4l2_decode_close(avctx);
++ // v4l2_decode_init(avctx);
++ // The downside is that this keeps a decoder open until all the frames
++ // associated with it have been returned. This is a bit wasteful on
++ // possibly limited h/w resources and fails on a Pi for this reason unless
++ // more GPU mem is allocated than is the default.
++
++ V4L2m2mPriv * const priv = avctx->priv_data;
++ V4L2m2mContext * const s = priv->context;
++ V4L2Context * const output = &s->output;
++ V4L2Context * const capture = &s->capture;
++
++ av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
++
++ // Reflushing everything is benign, quick and avoids having to worry about
++ // states like EOS processing so don't try to optimize out (having got it
++ // wrong once)
++
++ ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
++
++ // Clear any buffered input packet
++ av_packet_unref(&s->buf_pkt);
++
++ // Clear a pending EOS
++ if (ff_v4l2_ctx_eos(capture)) {
++ // Arguably we could delay this but this is easy and doesn't require
++ // thought or extra vars
++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
++ ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
++ }
++
++ // V4L2 makes no guarantees about whether decoded frames are flushed or not
++ // so mark all frames we are tracking to be discarded if they appear
++ xlat_flush(&s->xlat);
++
++ // resend extradata
++ s->extdata_sent = 0;
++ // clear status vars
++ s->running = 0;
++ s->draining = 0;
++ output->done = 0;
++ capture->done = 0;
++
++ // Stream on will occur when we actually submit a new frame
++ av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
+ }
+
+ #define OFFSET(x) offsetof(V4L2m2mPriv, x)
+@@ -227,9 +1262,16 @@ static const AVOption options[] = {
+ V4L_M2M_DEFAULT_OPTS,
+ { "num_capture_buffers", "Number of buffers in the capture context",
+ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
++ { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
++ { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS },
+ { NULL},
+ };
+
++static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
++ HW_CONFIG_INTERNAL(DRM_PRIME),
++ NULL
++};
++
+ #define M2MDEC_CLASS(NAME) \
+ static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
+ .class_name = #NAME "_v4l2m2m_decoder", \
+@@ -250,11 +1292,17 @@ static const AVOption options[] = {
+ .init = v4l2_decode_init, \
+ FF_CODEC_RECEIVE_FRAME_CB(v4l2_receive_frame), \
+ .close = v4l2_decode_close, \
++ .flush = v4l2_decode_flush, \
+ .bsfs = bsf_name, \
+ .p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
+ .caps_internal = FF_CODEC_CAP_NOT_INIT_THREADSAFE | \
+ FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
+ .p.wrapper_name = "v4l2m2m", \
++ .p.pix_fmts = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
++ AV_PIX_FMT_NV12, \
++ AV_PIX_FMT_YUV420P, \
++ AV_PIX_FMT_NONE}, \
++ .hw_configs = v4l2_m2m_hw_configs, \
+ }
+
+ M2MDEC(h264, "H.264", AV_CODEC_ID_H264, "h264_mp4toannexb");
+diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
+index 9a0837ecf3..524e9424a5 100644
+--- a/libavcodec/v4l2_m2m_enc.c
++++ b/libavcodec/v4l2_m2m_enc.c
+@@ -24,6 +24,8 @@
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <search.h>
++#include <drm_fourcc.h>
++
+ #include "encode.h"
+ #include "libavcodec/avcodec.h"
+ #include "libavutil/pixdesc.h"
+@@ -38,6 +40,34 @@
+ #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
+ #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
+
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in videodev2.h and hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
+ static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
+ {
+ struct v4l2_streamparm parm = { 0 };
+@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p)
+ static int v4l2_check_b_frame_support(V4L2m2mContext *s)
+ {
+ if (s->avctx->max_b_frames)
+- av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
++ av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
+
+- v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
++ v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
+ v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
+ if (s->avctx->max_b_frames == 0)
+ return 0;
+
+ avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
+-
+ return AVERROR_PATCHWELCOME;
+ }
+
+@@ -271,17 +300,208 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s)
+ return 0;
+ }
+
++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
++{
++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++ const uint32_t drm_fmt = src->layers[0].format;
++ // Treat INVALID as LINEAR
++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
++ uint32_t pix_fmt = 0;
++ uint32_t w = 0;
++ uint32_t h = 0;
++ uint32_t bpl = src->layers[0].planes[0].pitch;
++
++ // We really don't expect multiple layers
++ // All formats that we currently cope with are single object
++
++ if (src->nb_layers != 1 || src->nb_objects != 1)
++ return AVERROR(EINVAL);
++
++ switch (drm_fmt) {
++ case DRM_FORMAT_YUV420:
++ if (mod == DRM_FORMAT_MOD_LINEAR) {
++ if (src->layers[0].nb_planes != 3)
++ break;
++ pix_fmt = V4L2_PIX_FMT_YUV420;
++ h = src->layers[0].planes[1].offset / bpl;
++ w = bpl;
++ }
++ break;
++
++ case DRM_FORMAT_NV12:
++ if (mod == DRM_FORMAT_MOD_LINEAR) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12;
++ h = src->layers[0].planes[1].offset / bpl;
++ w = bpl;
++ }
++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12_COL128;
++ w = bpl;
++ h = src->layers[0].planes[1].offset / 128;
++ bpl = fourcc_mod_broadcom_param(mod);
++ }
++ break;
++
++ case DRM_FORMAT_P030:
++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
++ w = bpl / 2; // Matching lie to how we construct this
++ h = src->layers[0].planes[1].offset / 128;
++ bpl = fourcc_mod_broadcom_param(mod);
++ }
++ break;
++
++ default:
++ break;
++ }
++
++ if (!pix_fmt)
++ return AVERROR(EINVAL);
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
++
++ pix->width = w;
++ pix->height = h;
++ pix->pixelformat = pix_fmt;
++ pix->plane_fmt[0].bytesperline = bpl;
++ pix->num_planes = 1;
++ }
++ else {
++ struct v4l2_pix_format *const pix = &format->fmt.pix;
++
++ pix->width = w;
++ pix->height = h;
++ pix->pixelformat = pix_fmt;
++ pix->bytesperline = bpl;
++ }
++
++ return 0;
++}
++
++// Do we have similar enough formats to be usable?
++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
++{
++ if (a->type != b->type)
++ return 0;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
++ const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp;
++ const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp;
++ unsigned int i;
++ if (pa->pixelformat != pb->pixelformat ||
++ pa->num_planes != pb->num_planes)
++ return 0;
++ for (i = 0; i != pa->num_planes; ++i) {
++ if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline)
++ return 0;
++ }
++ }
++ else {
++ const struct v4l2_pix_format *const pa = &a->fmt.pix;
++ const struct v4l2_pix_format *const pb = &b->fmt.pix;
++ if (pa->pixelformat != pb->pixelformat ||
++ pa->bytesperline != pb->bytesperline)
++ return 0;
++ }
++ return 1;
++}
++
++static inline int q_full(const V4L2Context *const output)
++{
++ return ff_v4l2_context_q_count(output) == output->num_buffers;
++}
++
+ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
+ {
+ V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+ V4L2Context *const output = &s->output;
++ int rv;
++ const int needs_slot = q_full(output);
++
++ av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot);
++
++ // Signal EOF if needed (doesn't need q slot)
++ if (!frame) {
++ av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__);
++ return ff_v4l2_context_enqueue_frame(output, frame);
++ }
++
++ if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) {
++ // We should be able to return AVERROR(EAGAIN) to indicate buffer
++ // exhaustion, but ffmpeg currently treats that as fatal.
++ av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv));
++ return rv;
++ }
++
++ if (s->input_drm && !output->streamon) {
++ struct v4l2_format req_format = {.type = output->format.type};
++
++ // Set format when we first get a buffer
++ if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
++ return rv;
++ }
++
++ ff_v4l2_context_release(output);
++
++ output->format = req_format;
++
++ if ((rv = ff_v4l2_context_set_format(output)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
++ return rv;
++ }
++
++ if (!fmt_eq(&req_format, &output->format)) {
++ av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
++ return AVERROR(EINVAL);
++ }
++
++ output->selection.top = frame->crop_top;
++ output->selection.left = frame->crop_left;
++ output->selection.width = av_frame_cropped_width(frame);
++ output->selection.height = av_frame_cropped_height(frame);
++
++ if ((rv = ff_v4l2_context_init(output)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
++ return rv;
++ }
++
++ {
++ struct v4l2_selection selection = {
++ .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
++ .target = V4L2_SEL_TGT_CROP,
++ .r = output->selection
++ };
++ if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
++ selection.r.width, selection.r.height, selection.r.left, selection.r.top,
++ av_err2str(AVERROR(errno)));
++ }
++ av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
++ selection.r.width, selection.r.height, selection.r.left, selection.r.top);
++ }
++ }
+
+ #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
+- if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
++ if (frame->pict_type == AV_PICTURE_TYPE_I)
+ v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
+ #endif
+
+- return ff_v4l2_context_enqueue_frame(output, frame);
++ rv = ff_v4l2_context_enqueue_frame(output, frame);
++ if (rv) {
++ av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv));
++ }
++
++ return rv;
+ }
+
+ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
+@@ -292,6 +512,11 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
+ AVFrame *frame = s->frame;
+ int ret;
+
++ av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__,
++ ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture));
++
++ ff_v4l2_dq_all(output, 0);
++
+ if (s->draining)
+ goto dequeue;
+
+@@ -328,7 +553,115 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
+ }
+
+ dequeue:
+- return ff_v4l2_context_dequeue_packet(capture, avpkt);
++ // Dequeue a frame
++ for (;;) {
++ int t = q_full(output) ? -1 : s->draining ? 300 : 0;
++ int rv2;
++
++ // If output is full wait for either a packet or output to become not full
++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t);
++
++ // If output was full retry packet dequeue
++ t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300;
++ rv2 = ff_v4l2_dq_all(output, t);
++ if (t == 0 || rv2 != 0)
++ break;
++ }
++ if (ret)
++ return (s->draining && ret == AVERROR(EAGAIN)) ? AVERROR_EOF : ret;
++
++ if (capture->first_buf == 1) {
++ uint8_t * data;
++ const int len = avpkt->size;
++
++ // 1st buffer after streamon should be SPS/PPS
++ capture->first_buf = 2;
++
++ // Clear both possible stores so there is no chance of confusion
++ av_freep(&s->extdata_data);
++ s->extdata_size = 0;
++ av_freep(&avctx->extradata);
++ avctx->extradata_size = 0;
++
++ if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
++ goto fail_no_mem;
++
++ memcpy(data, avpkt->data, len);
++ av_packet_unref(avpkt);
++
++ // We need to copy the header, but keep local if not global
++ if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
++ avctx->extradata = data;
++ avctx->extradata_size = len;
++ }
++ else {
++ s->extdata_data = data;
++ s->extdata_size = len;
++ }
++
++ ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0);
++ ff_v4l2_dq_all(output, 0);
++ if (ret)
++ return ret;
++ }
++
++ // First frame must be key so mark as such even if encoder forgot
++ if (capture->first_buf == 2) {
++ avpkt->flags |= AV_PKT_FLAG_KEY;
++
++ // Add any extradata to the 1st packet we emit as we cannot create it at init
++ if (avctx->extradata_size > 0 && avctx->extradata) {
++ void * const side = av_packet_new_side_data(avpkt,
++ AV_PKT_DATA_NEW_EXTRADATA,
++ avctx->extradata_size);
++ if (!side)
++ goto fail_no_mem;
++
++ memcpy(side, avctx->extradata, avctx->extradata_size);
++ }
++ }
++
++ // Add SPS/PPS to the start of every key frame if non-global headers
++ if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
++ const size_t newlen = s->extdata_size + avpkt->size;
++ AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
++
++ if (buf == NULL)
++ goto fail_no_mem;
++
++ memcpy(buf->data, s->extdata_data, s->extdata_size);
++ memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
++
++ av_buffer_unref(&avpkt->buf);
++ avpkt->buf = buf;
++ avpkt->data = buf->data;
++ avpkt->size = newlen;
++ }
++ else if (ff_v4l2_context_q_count(capture) < 2) {
++ // Avoid running out of capture buffers
++ // In most cases the buffers will be returned quickly in which case
++ // we don't copy and can use the v4l2 buffers directly but sometimes
++ // ffmpeg seems to hold onto all of them for a long time (.mkv
++ // creation?) so avoid deadlock in those cases.
++ AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
++ if (buf == NULL)
++ goto fail_no_mem;
++
++ memcpy(buf->data, avpkt->data, avpkt->size);
++ av_buffer_unref(&avpkt->buf); // Will recycle the V4L2 buffer
++
++ avpkt->buf = buf;
++ avpkt->data = buf->data;
++ }
++
++ capture->first_buf = 0;
++ return 0;
++
++fail_no_mem:
++ av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n");
++ ret = AVERROR(ENOMEM);
++ av_packet_unref(avpkt);
++ return ret;
+ }
+
+ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+@@ -340,6 +673,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+ uint32_t v4l2_fmt_output;
+ int ret;
+
++ av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
++
+ ret = ff_v4l2_m2m_create_context(priv, &s);
+ if (ret < 0)
+ return ret;
+@@ -347,13 +682,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+ capture = &s->capture;
+ output = &s->output;
+
++ s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
++
+ /* common settings output/capture */
+ output->height = capture->height = avctx->height;
+ output->width = capture->width = avctx->width;
+
+ /* output context */
+ output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+- output->av_pix_fmt = avctx->pix_fmt;
++ output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
++ avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
++ AV_PIX_FMT_YUV420P;
+
+ /* capture context */
+ capture->av_codec_id = avctx->codec_id;
+@@ -372,7 +711,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+ v4l2_fmt_output = output->format.fmt.pix.pixelformat;
+
+ pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
+- if (pix_fmt_output != avctx->pix_fmt) {
++ if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
+ av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
+ return AVERROR(EINVAL);
+@@ -390,9 +729,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx)
+ #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+
+ #define V4L_M2M_CAPTURE_OPTS \
+- V4L_M2M_DEFAULT_OPTS,\
++ { "num_output_buffers", "Number of buffers in the output context",\
++ OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\
+ { "num_capture_buffers", "Number of buffers in the capture context", \
+- OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS }
++ OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS }
+
+ static const AVOption mpeg4_options[] = {
+ V4L_M2M_CAPTURE_OPTS,
+diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c
+new file mode 100644
+index 0000000000..5b3fb958fa
+--- /dev/null
++++ b/libavcodec/v4l2_req_decode_q.c
+@@ -0,0 +1,84 @@
++#include <memory.h>
++#include <semaphore.h>
++#include <pthread.h>
++
++#include "v4l2_req_decode_q.h"
++
++int decode_q_in_q(const req_decode_ent * const d)
++{
++ return d->in_q;
++}
++
++void decode_q_add(req_decode_q * const q, req_decode_ent * const d)
++{
++ pthread_mutex_lock(&q->q_lock);
++ if (!q->head) {
++ q->head = d;
++ q->tail = d;
++ d->prev = NULL;
++ }
++ else {
++ q->tail->next = d;
++ d->prev = q->tail;
++ q->tail = d;
++ }
++ d->next = NULL;
++ d->in_q = 1;
++ pthread_mutex_unlock(&q->q_lock);
++}
++
++// Remove entry from Q - if head wake-up anything that was waiting
++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d)
++{
++ int try_signal = 0;
++
++ if (!d->in_q)
++ return;
++
++ pthread_mutex_lock(&q->q_lock);
++ if (d->prev)
++ d->prev->next = d->next;
++ else {
++ try_signal = 1; // Only need to signal if we were head
++ q->head = d->next;
++ }
++
++ if (d->next)
++ d->next->prev = d->prev;
++ else
++ q->tail = d->prev;
++
++ // Not strictly needed but makes debug easier
++ d->next = NULL;
++ d->prev = NULL;
++ d->in_q = 0;
++ pthread_mutex_unlock(&q->q_lock);
++
++ if (try_signal)
++ pthread_cond_broadcast(&q->q_cond);
++}
++
++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d)
++{
++ pthread_mutex_lock(&q->q_lock);
++
++ while (q->head != d)
++ pthread_cond_wait(&q->q_cond, &q->q_lock);
++
++ pthread_mutex_unlock(&q->q_lock);
++}
++
++void decode_q_uninit(req_decode_q * const q)
++{
++ pthread_mutex_destroy(&q->q_lock);
++ pthread_cond_destroy(&q->q_cond);
++}
++
++void decode_q_init(req_decode_q * const q)
++{
++ memset(q, 0, sizeof(*q));
++ pthread_mutex_init(&q->q_lock, NULL);
++ pthread_cond_init(&q->q_cond, NULL);
++}
++
++
+diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h
+new file mode 100644
+index 0000000000..af7bbe1de4
+--- /dev/null
++++ b/libavcodec/v4l2_req_decode_q.h
+@@ -0,0 +1,25 @@
++#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
++#define AVCODEC_V4L2_REQ_DECODE_Q_H
++
++typedef struct req_decode_ent {
++ struct req_decode_ent * next;
++ struct req_decode_ent * prev;
++ int in_q;
++} req_decode_ent;
++
++typedef struct req_decode_q {
++ pthread_mutex_t q_lock;
++ pthread_cond_t q_cond;
++ req_decode_ent * head;
++ req_decode_ent * tail;
++} req_decode_q;
++
++int decode_q_in_q(const req_decode_ent * const d);
++void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_uninit(req_decode_q * const q);
++void decode_q_init(req_decode_q * const q);
++
++#endif
++
+diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c
+new file mode 100644
+index 0000000000..cfa94d55c4
+--- /dev/null
++++ b/libavcodec/v4l2_req_devscan.c
+@@ -0,0 +1,449 @@
++#include <errno.h>
++#include <fcntl.h>
++#include <libudev.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++
++#include <sys/ioctl.h>
++#include <sys/sysmacros.h>
++
++#include <linux/media.h>
++#include <linux/videodev2.h>
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_utils.h"
++
++struct decdev {
++ enum v4l2_buf_type src_type;
++ uint32_t src_fmt_v4l2;
++ const char * vname;
++ const char * mname;
++};
++
++struct devscan {
++ struct decdev env;
++ unsigned int dev_size;
++ unsigned int dev_count;
++ struct decdev *devs;
++};
++
++static int video_src_pixfmt_supported(uint32_t fmt)
++{
++ return 1;
++}
++
++static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
++ unsigned int width, unsigned int height,
++ unsigned int pixelformat)
++{
++ unsigned int sizeimage;
++
++ memset(format, 0, sizeof(*format));
++ format->type = type;
++
++ sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
++ format->fmt.pix_mp.width = width;
++ format->fmt.pix_mp.height = height;
++ format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage;
++ format->fmt.pix_mp.pixelformat = pixelformat;
++ } else {
++ format->fmt.pix.width = width;
++ format->fmt.pix.height = height;
++ format->fmt.pix.sizeimage = sizeimage;
++ format->fmt.pix.pixelformat = pixelformat;
++ }
++}
++
++static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
++ unsigned int width, unsigned int height)
++{
++ struct v4l2_format format;
++
++ v4l2_setup_format(&format, type, width, height, pixelformat);
++
++ return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0;
++}
++
++static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities)
++{
++ struct v4l2_capability capability = { 0 };
++ int rc;
++
++ rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability);
++ if (rc < 0)
++ return -errno;
++
++ if (capabilities != NULL) {
++ if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
++ *capabilities = capability.device_caps;
++ else
++ *capabilities = capability.capabilities;
++ }
++
++ return 0;
++}
++
++static int devscan_add(struct devscan *const scan,
++ enum v4l2_buf_type src_type,
++ uint32_t src_fmt_v4l2,
++ const char * vname,
++ const char * mname)
++{
++ struct decdev *d;
++
++ if (scan->dev_size <= scan->dev_count) {
++ unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2;
++ d = realloc(scan->devs, n * sizeof(*d));
++ if (!d)
++ return -ENOMEM;
++ scan->devs = d;
++ scan->dev_size = n;
++ }
++
++ d = scan->devs + scan->dev_count;
++ d->src_type = src_type;
++ d->src_fmt_v4l2 = src_fmt_v4l2;
++ d->vname = strdup(vname);
++ if (!d->vname)
++ return -ENOMEM;
++ d->mname = strdup(mname);
++ if (!d->mname) {
++ free((char *)d->vname);
++ return -ENOMEM;
++ }
++ ++scan->dev_count;
++ return 0;
++}
++
++void devscan_delete(struct devscan **const pScan)
++{
++ unsigned int i;
++ struct devscan * const scan = *pScan;
++
++ if (!scan)
++ return;
++ *pScan = NULL;
++
++ for (i = 0; i < scan->dev_count; ++i) {
++ free((char*)scan->devs[i].mname);
++ free((char*)scan->devs[i].vname);
++ }
++ free(scan->devs);
++ free(scan);
++}
++
++#define REQ_BUF_CAPS (\
++ V4L2_BUF_CAP_SUPPORTS_DMABUF |\
++ V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
++ V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
++
++static void probe_formats(void * const dc,
++ struct devscan *const scan,
++ const int fd,
++ const unsigned int type_v4l2,
++ const char *const mpath,
++ const char *const vpath)
++{
++ unsigned int i;
++ for (i = 0;; ++i) {
++ struct v4l2_fmtdesc fmtdesc = {
++ .index = i,
++ .type = type_v4l2
++ };
++ struct v4l2_requestbuffers rbufs = {
++ .count = 0,
++ .type = type_v4l2,
++ .memory = V4L2_MEMORY_MMAP
++ };
++ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++ if (errno == EINTR)
++ continue;
++ if (errno != EINVAL)
++ request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
++ return;
++ }
++ if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
++ continue;
++
++ if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
++ request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
++ continue;
++ }
++
++ while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) {
++ if (errno != EINTR) {
++ request_debug(dc, "%s: Reqbufs failed\n", vpath);
++ continue;
++ }
++ }
++
++ if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
++ request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
++ continue;
++ }
++
++ request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
++ mpath, vpath, fmtdesc.pixelformat, type_v4l2);
++ devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath);
++ }
++}
++
++
++static int probe_video_device(void * const dc,
++ struct udev_device *const device,
++ struct devscan *const scan,
++ const char *const mpath)
++{
++ int ret;
++ unsigned int capabilities = 0;
++ int video_fd = -1;
++
++ const char *path = udev_device_get_devnode(device);
++ if (!path) {
++ request_err(dc, "%s: get video device devnode failed\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ video_fd = open(path, O_RDWR, 0);
++ if (video_fd == -1) {
++ ret = -errno;
++ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
++ goto fail;
++ }
++
++ ret = v4l2_query_capabilities(video_fd, &capabilities);
++ if (ret < 0) {
++ request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
++
++ if (!(capabilities & V4L2_CAP_STREAMING)) {
++ request_debug(dc, "%s: missing required streaming capability\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) {
++ request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ /* Should check capture formats too... */
++ if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
++ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
++ if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
++ probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
++
++ close(video_fd);
++ return 0;
++
++fail:
++ if (video_fd >= 0)
++ close(video_fd);
++ return ret;
++}
++
++static int probe_media_device(void * const dc,
++ struct udev_device *const device,
++ struct devscan *const scan)
++{
++ int ret;
++ int rv;
++ struct media_device_info device_info = { 0 };
++ struct media_v2_topology topology = { 0 };
++ struct media_v2_interface *interfaces = NULL;
++ struct udev *udev = udev_device_get_udev(device);
++ struct udev_device *video_device;
++ dev_t devnum;
++ int media_fd = -1;
++
++ const char *path = udev_device_get_devnode(device);
++ if (!path) {
++ request_err(dc, "%s: get media device devnode failed\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ media_fd = open(path, O_RDWR, 0);
++ if (media_fd < 0) {
++ ret = -errno;
++ request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
++ if (rv < 0) {
++ ret = -errno;
++ request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
++ if (rv < 0) {
++ ret = -errno;
++ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ if (topology.num_interfaces <= 0) {
++ request_err(dc, "%s: media device has no interfaces\n", __func__);
++ ret = -EINVAL;
++ goto fail;
++ }
++
++ interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
++ if (!interfaces) {
++ request_err(dc, "%s: allocating media interface struct failed\n", __func__);
++ ret = -ENOMEM;
++ goto fail;
++ }
++
++ topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
++ rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
++ if (rv < 0) {
++ ret = -errno;
++ request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++ goto fail;
++ }
++
++ for (int i = 0; i < topology.num_interfaces; i++) {
++ if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
++ continue;
++
++ devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
++ video_device = udev_device_new_from_devnum(udev, 'c', devnum);
++ if (!video_device) {
++ ret = -errno;
++ request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
++ continue;
++ }
++
++ ret = probe_video_device(dc, video_device, scan, path);
++ udev_device_unref(video_device);
++
++ if (ret != 0)
++ goto fail;
++ }
++
++fail:
++ free(interfaces);
++ if (media_fd != -1)
++ close(media_fd);
++ return ret;
++}
++
++const char *decdev_media_path(const struct decdev *const dev)
++{
++ return !dev ? NULL : dev->mname;
++}
++
++const char *decdev_video_path(const struct decdev *const dev)
++{
++ return !dev ? NULL : dev->vname;
++}
++
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev)
++{
++ return !dev ? 0 : dev->src_type;
++}
++
++uint32_t decdev_src_pixelformat(const struct decdev *const dev)
++{
++ return !dev ? 0 : dev->src_fmt_v4l2;
++}
++
++
++const struct decdev *devscan_find(struct devscan *const scan,
++ const uint32_t src_fmt_v4l2)
++{
++ unsigned int i;
++
++ if (scan->env.mname && scan->env.vname)
++ return &scan->env;
++
++ if (!src_fmt_v4l2)
++ return scan->dev_count ? scan->devs + 0 : NULL;
++
++ for (i = 0; i != scan->dev_count; ++i) {
++ if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2)
++ return scan->devs + i;
++ }
++ return NULL;
++}
++
++int devscan_build(void * const dc, struct devscan **pscan)
++{
++ int ret;
++ struct udev *udev;
++ struct udev_enumerate *enumerate;
++ struct udev_list_entry *devices;
++ struct udev_list_entry *entry;
++ struct udev_device *device;
++ struct devscan * scan;
++
++ *pscan = NULL;
++
++ scan = calloc(1, sizeof(*scan));
++ if (!scan) {
++ ret = -ENOMEM;
++ goto fail;
++ }
++
++ scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
++ scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
++ if (scan->env.mname && scan->env.vname) {
++ request_info(dc, "Media/video device env overrides found: %s,%s\n",
++ scan->env.mname, scan->env.vname);
++ *pscan = scan;
++ return 0;
++ }
++
++ udev = udev_new();
++ if (!udev) {
++ request_err(dc, "%s: allocating udev context failed\n", __func__);
++ ret = -ENOMEM;
++ goto fail;
++ }
++
++ enumerate = udev_enumerate_new(udev);
++ if (!enumerate) {
++ request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
++ ret = -ENOMEM;
++ goto fail;
++ }
++
++ udev_enumerate_add_match_subsystem(enumerate, "media");
++ udev_enumerate_scan_devices(enumerate);
++
++ devices = udev_enumerate_get_list_entry(enumerate);
++ udev_list_entry_foreach(entry, devices) {
++ const char *path = udev_list_entry_get_name(entry);
++ if (!path)
++ continue;
++
++ device = udev_device_new_from_syspath(udev, path);
++ if (!device)
++ continue;
++
++ probe_media_device(dc, device, scan);
++ udev_device_unref(device);
++ }
++
++ udev_enumerate_unref(enumerate);
++
++ *pscan = scan;
++ return 0;
++
++fail:
++ udev_unref(udev);
++ devscan_delete(&scan);
++ return ret;
++}
++
+diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
+new file mode 100644
+index 0000000000..956d9234f1
+--- /dev/null
++++ b/libavcodec/v4l2_req_devscan.h
+@@ -0,0 +1,23 @@
++#ifndef _DEVSCAN_H_
++#define _DEVSCAN_H_
++
++#include <stdint.h>
++
++struct devscan;
++struct decdev;
++enum v4l2_buf_type;
++
++/* These return pointers to data in the devscan structure and so are valid
++ * for the lifetime of that
++ */
++const char *decdev_media_path(const struct decdev *const dev);
++const char *decdev_video_path(const struct decdev *const dev);
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
++uint32_t decdev_src_pixelformat(const struct decdev *const dev);
++
++const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++
++int devscan_build(void * const dc, struct devscan **pscan);
++void devscan_delete(struct devscan **const pScan);
++
++#endif
+diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
+new file mode 100644
+index 0000000000..acc0366e76
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.c
+@@ -0,0 +1,369 @@
++#include <stdatomic.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <inttypes.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <string.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++#include <linux/mman.h>
++#include <linux/dma-buf.h>
++#include <linux/dma-heap.h>
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_utils.h"
++
++#define DMABUF_NAME1 "/dev/dma_heap/linux,cma"
++#define DMABUF_NAME2 "/dev/dma_heap/reserved"
++
++#define TRACE_ALLOC 0
++
++struct dmabufs_ctl;
++struct dmabuf_h;
++
++struct dmabuf_fns {
++ int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size);
++ void (*buf_free)(struct dmabuf_h * dh);
++ int (*ctl_new)(struct dmabufs_ctl * dbsc);
++ void (*ctl_free)(struct dmabufs_ctl * dbsc);
++};
++
++struct dmabufs_ctl {
++ atomic_int ref_count;
++ int fd;
++ size_t page_size;
++ void * v;
++ const struct dmabuf_fns * fns;
++};
++
++struct dmabuf_h {
++ int fd;
++ size_t size;
++ size_t len;
++ void * mapptr;
++ void * v;
++ const struct dmabuf_fns * fns;
++};
++
++#if TRACE_ALLOC
++static unsigned int total_bufs = 0;
++static size_t total_size = 0;
++#endif
++
++struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size)
++{
++ struct dmabuf_h *dh;
++
++ if (mapptr == MAP_FAILED)
++ return NULL;
++
++ dh = malloc(sizeof(*dh));
++ if (!dh)
++ return NULL;
++
++ *dh = (struct dmabuf_h) {
++ .fd = -1,
++ .size = size,
++ .mapptr = mapptr
++ };
++
++ return dh;
++}
++
++struct dmabuf_h * dmabuf_import(int fd, size_t size)
++{
++ struct dmabuf_h *dh;
++
++ fd = dup(fd);
++ if (fd < 0 || size == 0)
++ return NULL;
++
++ dh = malloc(sizeof(*dh));
++ if (!dh) {
++ close(fd);
++ return NULL;
++ }
++
++ *dh = (struct dmabuf_h) {
++ .fd = fd,
++ .size = size,
++ .mapptr = MAP_FAILED
++ };
++
++#if TRACE_ALLOC
++ ++total_bufs;
++ total_size += dh->size;
++ request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++ return dh;
++}
++
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
++{
++ struct dmabuf_h * dh;
++ if (old != NULL) {
++ if (old->size >= size) {
++ return old;
++ }
++ dmabuf_free(old);
++ }
++
++ if (size == 0 ||
++ (dh = malloc(sizeof(*dh))) == NULL)
++ return NULL;
++
++ *dh = (struct dmabuf_h){
++ .fd = -1,
++ .mapptr = MAP_FAILED,
++ .fns = dbsc->fns
++ };
++
++ if (dh->fns->buf_alloc(dbsc, dh, size) != 0)
++ goto fail;
++
++
++#if TRACE_ALLOC
++ ++total_bufs;
++ total_size += dh->size;
++ request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++ return dh;
++
++fail:
++ free(dh);
++ return NULL;
++}
++
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
++{
++ struct dma_buf_sync sync = {
++ .flags = flags
++ };
++ if (dh->fd == -1)
++ return 0;
++ while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
++ const int err = errno;
++ if (errno == EINTR)
++ continue;
++ request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
++ return -err;
++ }
++ return 0;
++}
++
++int dmabuf_write_start(struct dmabuf_h * const dh)
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
++}
++
++int dmabuf_write_end(struct dmabuf_h * const dh)
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
++}
++
++int dmabuf_read_start(struct dmabuf_h * const dh)
++{
++ if (!dmabuf_map(dh))
++ return -1;
++ return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
++}
++
++int dmabuf_read_end(struct dmabuf_h * const dh)
++{
++ return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
++}
++
++
++void * dmabuf_map(struct dmabuf_h * const dh)
++{
++ if (!dh)
++ return NULL;
++ if (dh->mapptr != MAP_FAILED)
++ return dh->mapptr;
++ dh->mapptr = mmap(NULL, dh->size,
++ PROT_READ | PROT_WRITE,
++ MAP_SHARED | MAP_POPULATE,
++ dh->fd, 0);
++ if (dh->mapptr == MAP_FAILED) {
++ request_log("%s: Map failed\n", __func__);
++ return NULL;
++ }
++ return dh->mapptr;
++}
++
++int dmabuf_fd(const struct dmabuf_h * const dh)
++{
++ if (!dh)
++ return -1;
++ return dh->fd;
++}
++
++size_t dmabuf_size(const struct dmabuf_h * const dh)
++{
++ if (!dh)
++ return 0;
++ return dh->size;
++}
++
++size_t dmabuf_len(const struct dmabuf_h * const dh)
++{
++ if (!dh)
++ return 0;
++ return dh->len;
++}
++
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
++{
++ dh->len = len;
++}
++
++void dmabuf_free(struct dmabuf_h * dh)
++{
++ if (!dh)
++ return;
++
++#if TRACE_ALLOC
++ --total_bufs;
++ total_size -= dh->size;
++ request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++
++ dh->fns->buf_free(dh);
++
++ if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL)
++ munmap(dh->mapptr, dh->size);
++ if (dh->fd != -1)
++ while (close(dh->fd) == -1 && errno == EINTR)
++ /* loop */;
++ free(dh);
++}
++
++static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns)
++{
++ struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc));
++
++ if (!dbsc)
++ return NULL;
++
++ dbsc->fd = -1;
++ dbsc->fns = fns;
++ dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
++
++ if (fns->ctl_new(dbsc) != 0)
++ goto fail;
++
++ return dbsc;
++
++fail:
++ free(dbsc);
++ return NULL;
++}
++
++static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc)
++{
++ request_debug(NULL, "Free dmabuf ctl\n");
++
++ dbsc->fns->ctl_free(dbsc);
++
++ free(dbsc);
++}
++
++void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc)
++{
++ struct dmabufs_ctl * const dbsc = *pDbsc;
++
++ if (!dbsc)
++ return;
++ *pDbsc = NULL;
++
++ if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0)
++ return;
++
++ dmabufs_ctl_free(dbsc);
++}
++
++struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc)
++{
++ atomic_fetch_add(&dbsc->ref_count, 1);
++ return dbsc;
++}
++
++//-----------------------------------------------------------------------------
++//
++// Alloc dmabuf via CMA
++
++static int ctl_cma_new(struct dmabufs_ctl * dbsc)
++{
++ while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
++ errno == EINTR)
++ /* Loop */;
++
++ if (dbsc->fd == -1) {
++ while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 &&
++ errno == EINTR)
++ /* Loop */;
++ if (dbsc->fd == -1) {
++ request_log("Unable to open either %s or %s\n",
++ DMABUF_NAME1, DMABUF_NAME2);
++ return -1;
++ }
++ }
++ return 0;
++}
++
++static void ctl_cma_free(struct dmabufs_ctl * dbsc)
++{
++ if (dbsc->fd != -1)
++ while (close(dbsc->fd) == -1 && errno == EINTR)
++ /* loop */;
++
++}
++
++static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size)
++{
++ struct dma_heap_allocation_data data = {
++ .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
++ .fd = 0,
++ .fd_flags = O_RDWR,
++ .heap_flags = 0
++ };
++
++ while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
++ int err = errno;
++ request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
++ (uint64_t)data.len,
++ dbsc->fd,
++ err,
++ strerror(err));
++ if (err == EINTR)
++ continue;
++ return -err;
++ }
++
++ dh->fd = data.fd;
++ dh->size = (size_t)data.len;
++ return 0;
++}
++
++static void buf_cma_free(struct dmabuf_h * dh)
++{
++ // Nothing needed
++}
++
++static const struct dmabuf_fns dmabuf_cma_fns = {
++ .buf_alloc = buf_cma_alloc,
++ .buf_free = buf_cma_free,
++ .ctl_new = ctl_cma_new,
++ .ctl_free = ctl_cma_free,
++};
++
++struct dmabufs_ctl * dmabufs_ctl_new(void)
++{
++ request_debug(NULL, "Dmabufs using CMA\n");;
++ return dmabufs_ctl_new2(&dmabuf_cma_fns);
++}
++
+diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
+new file mode 100644
+index 0000000000..381ba2708d
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.h
+@@ -0,0 +1,44 @@
++#ifndef DMABUFS_H
++#define DMABUFS_H
++
++#include <stddef.h>
++
++struct dmabufs_ctl;
++struct dmabuf_h;
++
++struct dmabufs_ctl * dmabufs_ctl_new(void);
++void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc);
++struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc);
++
++// Need not preserve old contents
++// On NULL return old buffer is freed
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
++
++static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
++ return dmabuf_realloc(dbsc, NULL, size);
++}
++/* Create from existing fd - dups(fd) */
++struct dmabuf_h * dmabuf_import(int fd, size_t size);
++/* Import an MMAP - return NULL if mapptr == MAP_FAILED */
++struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size);
++
++void * dmabuf_map(struct dmabuf_h * const dh);
++
++/* flags from linux/dma-buf.h DMA_BUF_SYNC_xxx */
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
++
++int dmabuf_write_start(struct dmabuf_h * const dh);
++int dmabuf_write_end(struct dmabuf_h * const dh);
++int dmabuf_read_start(struct dmabuf_h * const dh);
++int dmabuf_read_end(struct dmabuf_h * const dh);
++
++int dmabuf_fd(const struct dmabuf_h * const dh);
++/* Allocated size */
++size_t dmabuf_size(const struct dmabuf_h * const dh);
++/* Bytes in use */
++size_t dmabuf_len(const struct dmabuf_h * const dh);
++/* Set bytes in use */
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
++void dmabuf_free(struct dmabuf_h * dh);
++
++#endif
+diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c
+new file mode 100644
+index 0000000000..169b532832
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v1.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 1
++#include "v4l2_req_hevc_vx.c"
++
+diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c
+new file mode 100644
+index 0000000000..42af98e156
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v2.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 2
++#include "v4l2_req_hevc_vx.c"
++
+diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c
+new file mode 100644
+index 0000000000..dcc8d95632
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v3.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 3
++#include "v4l2_req_hevc_vx.c"
++
+diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c
+new file mode 100644
+index 0000000000..c35579d8e0
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v4.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 4
++#include "v4l2_req_hevc_vx.c"
++
+diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
+new file mode 100644
+index 0000000000..e1bd5c6a1f
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_vx.c
+@@ -0,0 +1,1362 @@
++// File included by v4l2_req_hevc_v* - not compiled on its own
++
++#include "decode.h"
++#include "hevcdec.h"
++#include "hwconfig.h"
++#include "internal.h"
++#include "thread.h"
++
++#if HEVC_CTRLS_VERSION == 1
++#include "hevc-ctrls-v1.h"
++
++// Fixup renamed entries
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
++
++#elif HEVC_CTRLS_VERSION == 2
++#include "hevc-ctrls-v2.h"
++#elif HEVC_CTRLS_VERSION == 3
++#include "hevc-ctrls-v3.h"
++#elif HEVC_CTRLS_VERSION == 4
++#include <linux/v4l2-controls.h>
++#if !defined(V4L2_CID_STATELESS_HEVC_SPS)
++#include "hevc-ctrls-v4.h"
++#endif
++#else
++#error Unknown HEVC_CTRLS_VERSION
++#endif
++
++#ifndef V4L2_CID_STATELESS_HEVC_SPS
++#define V4L2_CID_STATELESS_HEVC_SPS V4L2_CID_MPEG_VIDEO_HEVC_SPS
++#define V4L2_CID_STATELESS_HEVC_PPS V4L2_CID_MPEG_VIDEO_HEVC_PPS
++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS
++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX
++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS
++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE
++#define V4L2_CID_STATELESS_HEVC_START_CODE V4L2_CID_MPEG_VIDEO_HEVC_START_CODE
++
++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED
++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED
++#define V4L2_STATELESS_HEVC_START_CODE_NONE V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE
++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
++#endif
++
++#include "v4l2_request_hevc.h"
++
++#include "libavutil/hwcontext_drm.h"
++
++#include <semaphore.h>
++#include <pthread.h>
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_utils.h"
++
++// Attached to buf[0] in frame
++// Pooled in hwcontext so generally create once - 1/frame
++typedef struct V4L2MediaReqDescriptor {
++ AVDRMFrameDescriptor drm;
++
++ // Media
++ uint64_t timestamp;
++ struct qent_dst * qe_dst;
++
++ // Decode only - should be NULL by the time we emit the frame
++ struct req_decode_ent decode_ent;
++
++ struct media_request *req;
++ struct qent_src *qe_src;
++
++#if HEVC_CTRLS_VERSION >= 2
++ struct v4l2_ctrl_hevc_decode_params dec;
++#endif
++
++ size_t num_slices;
++ size_t alloced_slices;
++ struct v4l2_ctrl_hevc_slice_params * slice_params;
++ struct slice_info * slices;
++
++ size_t num_offsets;
++ size_t alloced_offsets;
++ uint32_t *offsets;
++
++} V4L2MediaReqDescriptor;
++
++struct slice_info {
++ const uint8_t * ptr;
++ size_t len; // bytes
++ size_t n_offsets;
++};
++
++// Handy container for accumulating controls before setting
++struct req_controls {
++ int has_scaling;
++ struct timeval tv;
++ struct v4l2_ctrl_hevc_sps sps;
++ struct v4l2_ctrl_hevc_pps pps;
++ struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;
++};
++
++//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
++
++
++// Get an FFmpeg format from the v4l2 format
++static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
++{
++ switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
++ format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) {
++ case V4L2_PIX_FMT_YUV420:
++ return AV_PIX_FMT_YUV420P;
++ case V4L2_PIX_FMT_NV12:
++ return AV_PIX_FMT_NV12;
++#if CONFIG_SAND
++ case V4L2_PIX_FMT_NV12_COL128:
++ return AV_PIX_FMT_RPI4_8;
++ case V4L2_PIX_FMT_NV12_10_COL128:
++ return AV_PIX_FMT_RPI4_10;
++#endif
++ default:
++ break;
++ }
++ return AV_PIX_FMT_NONE;
++}
++
++static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
++{
++ const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
++ return rd->timestamp;
++}
++
++static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
++{
++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
++ rd->timestamp = dpb_stamp;
++}
++
++static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
++{
++ int32_t luma_weight_denom, chroma_weight_denom;
++ const SliceHeader *sh = &h->sh;
++
++ if (sh->slice_type == HEVC_SLICE_I ||
++ (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
++ (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
++ return;
++
++ table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
++
++ if (h->ps.sps->chroma_format_idc)
++ table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
++
++ luma_weight_denom = (1 << sh->luma_log2_weight_denom);
++ chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
++
++ for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
++ table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
++ table->luma_offset_l0[i] = sh->luma_offset_l0[i];
++ table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
++ table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
++ table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
++ table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
++ }
++
++ if (sh->slice_type != HEVC_SLICE_B)
++ return;
++
++ for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
++ table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
++ table->luma_offset_l1[i] = sh->luma_offset_l1[i];
++ table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
++ table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
++ table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
++ table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
++ }
++}
++
++#if HEVC_CTRLS_VERSION <= 2
++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
++{
++ const HEVCFrame *frame;
++ int i;
++
++ for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) {
++ frame = h->rps[ST_CURR_BEF].ref[i];
++ if (frame && timestamp == frame_capture_dpb(frame->frame))
++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE;
++ }
++
++ for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) {
++ frame = h->rps[ST_CURR_AFT].ref[i];
++ if (frame && timestamp == frame_capture_dpb(frame->frame))
++ return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER;
++ }
++
++ for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) {
++ frame = h->rps[LT_CURR].ref[i];
++ if (frame && timestamp == frame_capture_dpb(frame->frame))
++ return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR;
++ }
++
++ return 0;
++}
++#endif
++
++static unsigned int
++get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
++ const struct v4l2_hevc_dpb_entry * const entries,
++ const unsigned int num_entries)
++{
++ uint64_t timestamp;
++
++ if (!frame)
++ return 0;
++
++ timestamp = frame_capture_dpb(frame->frame);
++
++ for (unsigned int i = 0; i < num_entries; i++) {
++ if (entries[i].timestamp == timestamp)
++ return i;
++ }
++
++ return 0;
++}
++
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
++{
++ unsigned int z = 0;
++ while (idx--) {
++ if (*b++ == 0) {
++ ++z;
++ if (z >= 2 && *b == 3) {
++ ++b;
++ z = 0;
++ }
++ }
++ else {
++ z = 0;
++ }
++ }
++ return b;
++}
++
++static int slice_add(V4L2MediaReqDescriptor * const rd)
++{
++ if (rd->num_slices >= rd->alloced_slices) {
++ struct v4l2_ctrl_hevc_slice_params * p2;
++ struct slice_info * s2;
++ size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2;
++
++ p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
++ if (p2 == NULL)
++ return AVERROR(ENOMEM);
++ rd->slice_params = p2;
++
++ s2 = av_realloc_array(rd->slices, n2, sizeof(*s2));
++ if (s2 == NULL)
++ return AVERROR(ENOMEM);
++ rd->slices = s2;
++
++ rd->alloced_slices = n2;
++ }
++ ++rd->num_slices;
++ return 0;
++}
++
++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
++{
++ if (rd->num_offsets + n > rd->alloced_offsets) {
++ size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2;
++ void * p2;
++ while (rd->num_offsets + n > n2)
++ n2 *= 2;
++ if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
++ return AVERROR(ENOMEM);
++ rd->offsets = p2;
++ rd->alloced_offsets = n2;
++ }
++ for (size_t i = 0; i != n; ++i)
++ rd->offsets[rd->num_offsets++] = offsets[i] - 1;
++ return 0;
++}
++
++static unsigned int
++fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
++{
++ unsigned int i;
++ unsigned int n = 0;
++ const HEVCFrame * const pic = h->ref;
++
++ for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
++ const HEVCFrame * const frame = &h->DPB[i];
++ if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
++ struct v4l2_hevc_dpb_entry * const entry = entries + n++;
++
++ entry->timestamp = frame_capture_dpb(frame->frame);
++#if HEVC_CTRLS_VERSION <= 2
++ entry->rps = find_frame_rps_type(h, entry->timestamp);
++#else
++ entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
++ V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
++#endif
++ entry->field_pic = frame->frame->interlaced_frame;
++
++#if HEVC_CTRLS_VERSION <= 3
++ /* TODO: Interleaved: Get the POC for each field. */
++ entry->pic_order_cnt[0] = frame->poc;
++ entry->pic_order_cnt[1] = frame->poc;
++#else
++ entry->pic_order_cnt_val = frame->poc;
++#endif
++ }
++ }
++ return n;
++}
++
++static void fill_slice_params(const HEVCContext * const h,
++#if HEVC_CTRLS_VERSION >= 2
++ const struct v4l2_ctrl_hevc_decode_params * const dec,
++#endif
++ struct v4l2_ctrl_hevc_slice_params *slice_params,
++ uint32_t bit_size, uint32_t bit_offset)
++{
++ const SliceHeader * const sh = &h->sh;
++#if HEVC_CTRLS_VERSION >= 2
++ const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb;
++ const unsigned int dpb_n = dec->num_active_dpb_entries;
++#else
++ struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb;
++ unsigned int dpb_n;
++#endif
++ unsigned int i;
++ RefPicList *rpl;
++
++ *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
++ .bit_size = bit_size,
++#if HEVC_CTRLS_VERSION <= 3
++ .data_bit_offset = bit_offset,
++#else
++ .data_byte_offset = bit_offset / 8 + 1,
++#endif
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .slice_segment_addr = sh->slice_segment_addr,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++ .nal_unit_type = h->nal_unit_type,
++ .nuh_temporal_id_plus1 = h->temporal_id + 1,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .slice_type = sh->slice_type,
++ .colour_plane_id = sh->colour_plane_id,
++ .slice_pic_order_cnt = h->ref->poc,
++ .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0,
++ .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
++ .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
++ .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
++ .slice_qp_delta = sh->slice_qp_delta,
++ .slice_cb_qp_offset = sh->slice_cb_qp_offset,
++ .slice_cr_qp_offset = sh->slice_cr_qp_offset,
++ .slice_act_y_qp_offset = 0,
++ .slice_act_cb_qp_offset = 0,
++ .slice_act_cr_qp_offset = 0,
++ .slice_beta_offset_div2 = sh->beta_offset / 2,
++ .slice_tc_offset_div2 = sh->tc_offset / 2,
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++ .pic_struct = h->sei.picture_timing.picture_struct,
++
++#if HEVC_CTRLS_VERSION < 2
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++ .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
++ .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
++ .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
++#endif
++ };
++
++ if (sh->slice_sample_adaptive_offset_flag[0])
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
++
++ if (sh->slice_sample_adaptive_offset_flag[1])
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
++
++ if (sh->slice_temporal_mvp_enabled_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
++
++ if (sh->mvd_l1_zero_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
++
++ if (sh->cabac_init_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
++
++ if (sh->collocated_list == L0)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
++
++ if (sh->disable_deblocking_filter_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
++
++ if (sh->slice_loop_filter_across_slices_enabled_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
++
++ if (sh->dependent_slice_segment_flag)
++ slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
++
++#if HEVC_CTRLS_VERSION < 2
++ dpb_n = fill_dpb_entries(h, dpb);
++ slice_params->num_active_dpb_entries = dpb_n;
++#endif
++
++ if (sh->slice_type != HEVC_SLICE_I) {
++ rpl = &h->ref->refPicList[0];
++ for (i = 0; i < rpl->nb_refs; i++)
++ slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
++ }
++
++ if (sh->slice_type == HEVC_SLICE_B) {
++ rpl = &h->ref->refPicList[1];
++ for (i = 0; i < rpl->nb_refs; i++)
++ slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
++ }
++
++ fill_pred_table(h, &slice_params->pred_weight_table);
++
++ slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
++#if HEVC_CTRLS_VERSION <= 3
++ if (slice_params->num_entry_point_offsets > 256) {
++ slice_params->num_entry_point_offsets = 256;
++ av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
++ }
++
++ for (i = 0; i < slice_params->num_entry_point_offsets; i++)
++ slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
++#endif
++}
++
++#if HEVC_CTRLS_VERSION >= 2
++static void
++fill_decode_params(const HEVCContext * const h,
++ struct v4l2_ctrl_hevc_decode_params * const dec)
++{
++ unsigned int i;
++
++ *dec = (struct v4l2_ctrl_hevc_decode_params){
++ .pic_order_cnt_val = h->poc,
++ .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
++ .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
++ .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
++ };
++
++ dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
++
++ // The docn does seem to ask that we fit our 32 bit signed POC into
++ // a U8 so... (To be fair 16 bits would be enough)
++ // Luckily we (Pi) don't use these fields
++ for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i)
++ dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;
++ for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
++ dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
++ for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
++ dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
++
++ if (IS_IRAP(h))
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
++ if (IS_IDR(h))
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
++ if (h->sh.no_output_of_prior_pics_flag)
++ dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
++
++}
++#endif
++
++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
++{
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++ *ctrl = (struct v4l2_ctrl_hevc_sps) {
++ .chroma_format_idc = sps->chroma_format_idc,
++ .pic_width_in_luma_samples = sps->width,
++ .pic_height_in_luma_samples = sps->height,
++ .bit_depth_luma_minus8 = sps->bit_depth - 8,
++ .bit_depth_chroma_minus8 = sps->bit_depth - 8,
++ .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
++ .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
++ .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
++ .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
++ .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
++ .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
++ .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
++ .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
++ .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
++ .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
++ .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
++ .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
++ .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
++ .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
++ .num_short_term_ref_pic_sets = sps->nb_st_rps,
++ .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
++ .chroma_format_idc = sps->chroma_format_idc,
++ .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
++ };
++
++ if (sps->separate_colour_plane_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
++
++ if (sps->scaling_list_enable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
++
++ if (sps->amp_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
++
++ if (sps->sao_enabled)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
++
++ if (sps->pcm_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
++
++ if (sps->pcm.loop_filter_disable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
++
++ if (sps->long_term_ref_pics_present_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
++
++ if (sps->sps_temporal_mvp_enabled_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
++
++ if (sps->sps_strong_intra_smoothing_enable_flag)
++ ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
++}
++
++static void fill_scaling_matrix(const ScalingList * const sl,
++ struct v4l2_ctrl_hevc_scaling_matrix * const sm)
++{
++ unsigned int i;
++
++ for (i = 0; i < 6; i++) {
++ unsigned int j;
++
++ for (j = 0; j < 16; j++)
++ sm->scaling_list_4x4[i][j] = sl->sl[0][i][j];
++ for (j = 0; j < 64; j++) {
++ sm->scaling_list_8x8[i][j] = sl->sl[1][i][j];
++ sm->scaling_list_16x16[i][j] = sl->sl[2][i][j];
++ if (i < 2)
++ sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j];
++ }
++ sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i];
++ if (i < 2)
++ sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3];
++ }
++}
++
++static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps)
++{
++ uint64_t flags = 0;
++
++ if (pps->dependent_slice_segments_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
++
++ if (pps->output_flag_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
++
++ if (pps->sign_data_hiding_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
++
++ if (pps->cabac_init_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
++
++ if (pps->constrained_intra_pred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
++
++ if (pps->transform_skip_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
++
++ if (pps->cu_qp_delta_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
++
++ if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
++
++ if (pps->weighted_pred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
++
++ if (pps->weighted_bipred_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
++
++ if (pps->transquant_bypass_enable_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
++
++ if (pps->tiles_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
++
++ if (pps->entropy_coding_sync_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
++
++ if (pps->loop_filter_across_tiles_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
++
++ if (pps->seq_loop_filter_across_slices_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
++
++ if (pps->deblocking_filter_override_enabled_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
++
++ if (pps->disable_dbf)
++ flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
++
++ if (pps->lists_modification_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
++
++ if (pps->slice_header_extension_present_flag)
++ flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
++
++ /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++ *ctrl = (struct v4l2_ctrl_hevc_pps) {
++ .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
++ .init_qp_minus26 = pps->pic_init_qp_minus26,
++ .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
++ .pps_cb_qp_offset = pps->cb_qp_offset,
++ .pps_cr_qp_offset = pps->cr_qp_offset,
++ .pps_beta_offset_div2 = pps->beta_offset / 2,
++ .pps_tc_offset_div2 = pps->tc_offset / 2,
++ .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
++ .flags = flags
++ };
++
++
++ if (pps->tiles_enabled_flag) {
++ ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
++ ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
++
++ for (int i = 0; i < pps->num_tile_columns; i++)
++ ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
++
++ for (int i = 0; i < pps->num_tile_rows; i++)
++ ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
++ }
++}
++
++// Called before finally returning the frame to the user
++// Set corrupt flag here as this is actually the frame structure that
++// is going to the user (in MT land each thread has its own pool)
++static int frame_post_process(void *logctx, AVFrame *frame)
++{
++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0];
++
++// av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
++ frame->flags &= ~AV_FRAME_FLAG_CORRUPT;
++ if (rd->qe_dst) {
++ MediaBufsStatus stat = qent_dst_wait(rd->qe_dst);
++ if (stat != MEDIABUFS_STATUS_SUCCESS) {
++ av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
++ frame->flags |= AV_FRAME_FLAG_CORRUPT;
++ }
++ }
++
++ return 0;
++}
++
++static inline struct timeval cvt_dpb_to_tv(uint64_t t)
++{
++ t /= 1000;
++ return (struct timeval){
++ .tv_usec = t % 1000000,
++ .tv_sec = t / 1000000
++ };
++}
++
++static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
++{
++ return (uint64_t)t * 1000;
++}
++
++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx,
++ av_unused const uint8_t *buffer,
++ av_unused uint32_t size)
++{
++ const HEVCContext *h = avctx->priv_data;
++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++// av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
++ decode_q_add(&ctx->decode_q, &rd->decode_ent);
++
++ rd->num_slices = 0;
++ ctx->timestamp++;
++ rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp);
++
++ {
++ FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
++ fdd->post_process = frame_post_process;
++ }
++
++ // qe_dst needs to be bound to the data buffer and only returned when that is
++ if (!rd->qe_dst)
++ {
++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++ return AVERROR(ENOMEM);
++ }
++ }
++
++ ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
++
++ return 0;
++}
++
++// Object fd & size will be zapped by this & need setting later
++static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format)
++{
++ AVDRMLayerDescriptor *layer = &desc->layers[0];
++ unsigned int width;
++ unsigned int height;
++ unsigned int bpl;
++ uint32_t pixelformat;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++ width = format->fmt.pix_mp.width;
++ height = format->fmt.pix_mp.height;
++ pixelformat = format->fmt.pix_mp.pixelformat;
++ bpl = format->fmt.pix_mp.plane_fmt[0].bytesperline;
++ }
++ else {
++ width = format->fmt.pix.width;
++ height = format->fmt.pix.height;
++ pixelformat = format->fmt.pix.pixelformat;
++ bpl = format->fmt.pix.bytesperline;
++ }
++
++ switch (pixelformat) {
++ case V4L2_PIX_FMT_NV12:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#if CONFIG_SAND
++ case V4L2_PIX_FMT_NV12_COL128:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
++ break;
++ case V4L2_PIX_FMT_NV12_10_COL128:
++ layer->format = DRM_FORMAT_P030;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
++ break;
++#endif
++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
++ case V4L2_PIX_FMT_SUNXI_TILED_NV12:
++ layer->format = DRM_FORMAT_NV12;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
++ break;
++#endif
++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
++ case V4L2_PIX_FMT_NV15:
++ layer->format = DRM_FORMAT_NV15;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#endif
++ case V4L2_PIX_FMT_NV16:
++ layer->format = DRM_FORMAT_NV16;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
++ case V4L2_PIX_FMT_NV20:
++ layer->format = DRM_FORMAT_NV20;
++ desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++ break;
++#endif
++ default:
++ return -1;
++ }
++
++ desc->nb_objects = 1;
++ desc->objects[0].fd = -1;
++ desc->objects[0].size = 0;
++
++ desc->nb_layers = 1;
++ layer->nb_planes = 2;
++
++ layer->planes[0].object_index = 0;
++ layer->planes[0].offset = 0;
++ layer->planes[0].pitch = bpl;
++#if CONFIG_SAND
++ if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = height * 128;
++ layer->planes[0].pitch = width;
++ layer->planes[1].pitch = width;
++ }
++ else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = height * 128;
++ layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
++ layer->planes[1].pitch = width * 2;
++ }
++ else
++#endif
++ {
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = layer->planes[0].pitch * height;
++ layer->planes[1].pitch = layer->planes[0].pitch;
++ }
++
++ return 0;
++}
++
++static int
++set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
++ struct req_controls *const controls,
++#if HEVC_CTRLS_VERSION >= 2
++ struct v4l2_ctrl_hevc_decode_params * const dec,
++#endif
++ struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count,
++ void * const offsets, const size_t offset_count)
++{
++ int rv;
++#if HEVC_CTRLS_VERSION >= 2
++ unsigned int n = 3;
++#else
++ unsigned int n = 2;
++#endif
++
++ struct v4l2_ext_control control[6] = {
++ {
++ .id = V4L2_CID_STATELESS_HEVC_SPS,
++ .ptr = &controls->sps,
++ .size = sizeof(controls->sps),
++ },
++ {
++ .id = V4L2_CID_STATELESS_HEVC_PPS,
++ .ptr = &controls->pps,
++ .size = sizeof(controls->pps),
++ },
++#if HEVC_CTRLS_VERSION >= 2
++ {
++ .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
++ .ptr = dec,
++ .size = sizeof(*dec),
++ },
++#endif
++ };
++
++ if (slices)
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
++ .ptr = slices,
++ .size = sizeof(*slices) * slice_count,
++ };
++
++ if (controls->has_scaling)
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
++ .ptr = &controls->scaling_matrix,
++ .size = sizeof(controls->scaling_matrix),
++ };
++
++#if HEVC_CTRLS_VERSION >= 4
++ if (offsets)
++ control[n++] = (struct v4l2_ext_control) {
++ .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS,
++ .ptr = offsets,
++ .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count,
++ };
++#endif
++
++ rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n);
++
++ return rv;
++}
++
++// This only works because we started out from a single coded frame buffer
++// that will remain intact until after end_frame
++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
++{
++ const HEVCContext * const h = avctx->priv_data;
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
++ int bcount = get_bits_count(&h->HEVClc->gb);
++ uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
++
++ const unsigned int n = rd->num_slices;
++ const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices;
++
++ int rv;
++ struct slice_info * si;
++
++ // This looks dodgy but we know that FFmpeg has parsed this from a buffer
++ // that contains the entire frame including the start code
++ if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
++ buffer -= 3;
++ size += 3;
++ boff += 24;
++ if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) {
++ av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n",
++ buffer[0], buffer[1], buffer[2]);
++ }
++ }
++
++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
++ if (rd->slices == NULL) {
++ if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL)
++ return AVERROR(ENOMEM);
++ rd->slices->ptr = buffer;
++ rd->num_slices = 1;
++ }
++ rd->slices->len = buffer - rd->slices->ptr + size;
++ return 0;
++ }
++
++ if ((rv = slice_add(rd)) != 0)
++ return rv;
++
++ si = rd->slices + n;
++ si->ptr = buffer;
++ si->len = size;
++ si->n_offsets = rd->num_offsets;
++
++ if (n != block_start) {
++ struct slice_info *const si0 = rd->slices + block_start;
++ const size_t offset = (buffer - si0->ptr);
++ boff += offset * 8;
++ size += offset;
++ si0->len = si->len + offset;
++ }
++
++#if HEVC_CTRLS_VERSION >= 2
++ if (n == 0)
++ fill_decode_params(h, &rd->dec);
++ fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff);
++#else
++ fill_slice_params(h, rd->slice_params + n, size * 8, boff);
++#endif
++ if (ctx->max_offsets != 0 &&
++ (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0)
++ return rv;
++
++ return 0;
++}
++
++static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
++{
++ const HEVCContext * const h = avctx->priv_data;
++ if (h->ref != NULL) {
++ V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++ media_request_abort(&rd->req);
++ mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
++
++ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++ }
++}
++
++static int send_slice(AVCodecContext * const avctx,
++ V4L2MediaReqDescriptor * const rd,
++ struct req_controls *const controls,
++ const unsigned int i, const unsigned int j)
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++ const int is_last = (j == rd->num_slices);
++ struct slice_info *const si = rd->slices + i;
++ struct media_request * req = NULL;
++ struct qent_src * src = NULL;
++ MediaBufsStatus stat;
++ void * offsets = rd->offsets + rd->slices[i].n_offsets;
++ size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets;
++
++ if ((req = media_request_get(ctx->mpool)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
++ return AVERROR(ENOMEM);
++ }
++
++ if (set_req_ctls(ctx, req,
++ controls,
++#if HEVC_CTRLS_VERSION >= 2
++ &rd->dec,
++#endif
++ rd->slice_params + i, j - i,
++ offsets, n_offsets)) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
++ goto fail1;
++ }
++
++ if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
++ goto fail1;
++ }
++
++ if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
++ goto fail2;
++ }
++
++ if (qent_src_params_set(src, &controls->tv)) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
++ goto fail2;
++ }
++
++ stat = mediabufs_start_request(ctx->mbufs, &req, &src,
++ i == 0 ? rd->qe_dst : NULL,
++ is_last);
++
++ if (stat != MEDIABUFS_STATUS_SUCCESS) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
++ return AVERROR_UNKNOWN;
++ }
++ return 0;
++
++fail2:
++ mediabufs_src_qent_abort(ctx->mbufs, &src);
++fail1:
++ media_request_abort(&req);
++ return AVERROR_UNKNOWN;
++}
++
++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
++{
++ const HEVCContext * const h = avctx->priv_data;
++ V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++ struct req_controls rc;
++ unsigned int i;
++ int rv;
++
++ // It is possible, though maybe a bug, to get an end_frame without
++ // a previous start_frame. If we do then give up.
++ if (!decode_q_in_q(&rd->decode_ent)) {
++ av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
++ return AVERROR_INVALIDDATA;
++ }
++
++ {
++ const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ?
++ &h->ps.pps->scaling_list :
++ h->ps.sps->scaling_list_enable_flag ?
++ &h->ps.sps->scaling_list : NULL;
++
++
++ memset(&rc, 0, sizeof(rc));
++ rc.tv = cvt_dpb_to_tv(rd->timestamp);
++ fill_sps(&rc.sps, h->ps.sps);
++ fill_pps(&rc.pps, h->ps.pps);
++ if (sl) {
++ rc.has_scaling = 1;
++ fill_scaling_matrix(sl, &rc.scaling_matrix);
++ }
++ }
++
++ decode_q_wait(&ctx->decode_q, &rd->decode_ent);
++
++ // qe_dst needs to be bound to the data buffer and only returned when that is
++ // Alloc almost certainly wants to be serialised if there is any chance of blocking
++ // so we get the next frame to be free in the thread that needs it for decode first.
++ //
++ // In our current world this probably isn't a concern but put it here anyway
++ if (!rd->qe_dst)
++ {
++ if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++ rv = AVERROR(ENOMEM);
++ goto fail;
++ }
++ }
++
++ // Send as slices
++ for (i = 0; i < rd->num_slices; i += ctx->max_slices) {
++ const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices);
++ if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
++ goto fail;
++ }
++
++ // Set the drm_prime desriptor
++ drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
++ rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
++ rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
++
++ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++ return 0;
++
++fail:
++ decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++ return rv;
++}
++
++static inline int
++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
++{
++ return v >= c->minimum && v <= c->maximum;
++}
++
++// Initial check & init
++static int
++probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
++{
++ const HEVCContext *h = avctx->priv_data;
++ const HEVCSPS * const sps = h->ps.sps;
++ struct v4l2_ctrl_hevc_sps ctrl_sps;
++ unsigned int i;
++
++ // Check for var slice array
++ struct v4l2_query_ext_ctrl qc[] = {
++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_SPS },
++ { .id = V4L2_CID_STATELESS_HEVC_PPS },
++ { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
++#if HEVC_CTRLS_VERSION >= 2
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS },
++#endif
++ };
++ // Order & size must match!
++ static const size_t ctrl_sizes[] = {
++ sizeof(struct v4l2_ctrl_hevc_slice_params),
++ sizeof(int32_t),
++ sizeof(struct v4l2_ctrl_hevc_sps),
++ sizeof(struct v4l2_ctrl_hevc_pps),
++ sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
++#if HEVC_CTRLS_VERSION >= 2
++ sizeof(struct v4l2_ctrl_hevc_decode_params),
++#endif
++ };
++ const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
++
++#if HEVC_CTRLS_VERSION == 2
++ if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
++ return AVERROR(EINVAL);
++#elif HEVC_CTRLS_VERSION == 3
++ if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
++ return AVERROR(EINVAL);
++#endif
++
++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls);
++ i = 0;
++#if HEVC_CTRLS_VERSION >= 4
++ // Skip slice check if no slice mode
++ if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++ i = 1;
++#else
++ // Fail frame mode silently for anything prior to V4
++ if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++ return AVERROR(EINVAL);
++#endif
++ for (; i != noof_ctrls; ++i) {
++ if (qc[i].type == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id);
++ return AVERROR(EINVAL);
++ }
++ if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
++ av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
++ HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
++ return AVERROR(EINVAL);
++ }
++ }
++
++ fill_sps(&ctrl_sps, sps);
++
++ if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
++ return AVERROR(EINVAL);
++ }
++
++ return 0;
++}
++
++// Final init
++static int
++set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
++{
++ int ret;
++
++ struct v4l2_query_ext_ctrl querys[] = {
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
++#if HEVC_CTRLS_VERSION >= 4
++ { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, },
++#endif
++ };
++
++ struct v4l2_ext_control ctrls[] = {
++ { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++ { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
++ };
++
++ mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
++
++ ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) ||
++ querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
++ 1 : querys[2].dims[0];
++ av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
++
++#if HEVC_CTRLS_VERSION >= 4
++ ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ?
++ 0 : querys[3].dims[0];
++ av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
++#else
++ ctx->max_offsets = 0;
++#endif
++
++ if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED ||
++ querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)
++ ctx->decode_mode = querys[0].default_value;
++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED))
++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED;
++ else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++ ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
++ else {
++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
++ return AVERROR(EINVAL);
++ }
++
++ if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE ||
++ querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)
++ ctx->start_code = querys[1].default_value;
++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
++ else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
++ else {
++ av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
++ return AVERROR(EINVAL);
++ }
++
++ // If we are in slice mode & START_CODE_NONE supported then pick that
++ // as it doesn't require the slightly dodgy look backwards in our raw buffer
++ if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
++ ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
++ ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
++
++ ctrls[0].value = ctx->decode_mode;
++ ctrls[1].value = ctx->start_code;
++
++ ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls));
++ return !ret ? 0 : AVERROR(-ret);
++}
++
++static void v4l2_req_frame_free(void *opaque, uint8_t *data)
++{
++ AVCodecContext *avctx = opaque;
++ V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
++
++ av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
++
++ qent_dst_unref(&rd->qe_dst);
++
++ // We don't expect req or qe_src to be set
++ if (rd->req || rd->qe_src)
++ av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->req, rd->qe_src);
++
++ av_freep(&rd->slices);
++ av_freep(&rd->slice_params);
++ av_freep(&rd->offsets);
++
++ av_free(rd);
++}
++
++static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
++{
++ AVCodecContext *avctx = opaque;
++// V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++// V4L2MediaReqDescriptor *req;
++ AVBufferRef *ref;
++ uint8_t *data;
++// int ret;
++
++ data = av_mallocz(size);
++ if (!data)
++ return NULL;
++
++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
++ ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0);
++ if (!ref) {
++ av_freep(&data);
++ return NULL;
++ }
++ return ref;
++}
++
++#if 0
++static void v4l2_req_pool_free(void *opaque)
++{
++ av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
++}
++
++static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
++{
++ av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
++
++ av_buffer_pool_uninit(&hwfc->pool);
++}
++#endif
++
++static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
++{
++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++ AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
++ const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
++
++ hwfc->format = AV_PIX_FMT_DRM_PRIME;
++ hwfc->sw_format = pixel_format_from_format(vfmt);
++ if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) {
++ hwfc->width = vfmt->fmt.pix_mp.width;
++ hwfc->height = vfmt->fmt.pix_mp.height;
++ } else {
++ hwfc->width = vfmt->fmt.pix.width;
++ hwfc->height = vfmt->fmt.pix.height;
++ }
++#if 0
++ hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
++ if (!hwfc->pool)
++ return AVERROR(ENOMEM);
++
++ hwfc->free = v4l2_req_hwframe_ctx_free;
++
++ hwfc->initial_pool_size = 1;
++
++ switch (avctx->codec_id) {
++ case AV_CODEC_ID_VP9:
++ hwfc->initial_pool_size += 8;
++ break;
++ case AV_CODEC_ID_VP8:
++ hwfc->initial_pool_size += 3;
++ break;
++ default:
++ hwfc->initial_pool_size += 2;
++ }
++#endif
++ av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
++
++ return 0;
++}
++
++static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{
++ int rv;
++
++ frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
++ if (!frame->buf[0])
++ return AVERROR(ENOMEM);
++
++ frame->data[0] = frame->buf[0]->data;
++
++ frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
++
++ if ((rv = ff_attach_decode_data(frame)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
++ av_frame_unref(frame);
++ return rv;
++ }
++
++ return 0;
++}
++
++const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = {
++ .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
++ .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
++ .probe = probe,
++ .set_controls = set_controls,
++
++ .start_frame = v4l2_request_hevc_start_frame,
++ .decode_slice = v4l2_request_hevc_decode_slice,
++ .end_frame = v4l2_request_hevc_end_frame,
++ .abort_frame = v4l2_request_hevc_abort_frame,
++ .frame_params = frame_params,
++ .alloc_frame = alloc_frame,
++};
++
+diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
+new file mode 100644
+index 0000000000..1a9944774a
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.c
+@@ -0,0 +1,1802 @@
++/*
++ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include <errno.h>
++#include <fcntl.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++#include <linux/media.h>
++#include <linux/mman.h>
++#include <sys/ioctl.h>
++#include <sys/select.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++
++#include <linux/videodev2.h>
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++#include "weak_link.h"
++
++
++/* floor(log2(x)) */
++static unsigned int log2_size(size_t x)
++{
++ unsigned int n = 0;
++
++ if (x & ~0xffff) {
++ n += 16;
++ x >>= 16;
++ }
++ if (x & ~0xff) {
++ n += 8;
++ x >>= 8;
++ }
++ if (x & ~0xf) {
++ n += 4;
++ x >>= 4;
++ }
++ if (x & ~3) {
++ n += 2;
++ x >>= 2;
++ }
++ return (x & ~1) ? n + 1 : n;
++}
++
++static size_t round_up_size(const size_t x)
++{
++ /* Admit no size < 256 */
++ const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
++
++ return x >= (3 << n) ? 4 << n : (3 << n);
++}
++
++struct media_request;
++
++struct media_pool {
++ int fd;
++ sem_t sem;
++ pthread_mutex_t lock;
++ struct media_request * free_reqs;
++ struct pollqueue * pq;
++};
++
++struct media_request {
++ struct media_request * next;
++ struct media_pool * mp;
++ int fd;
++ struct polltask * pt;
++};
++
++static inline enum v4l2_memory
++mediabufs_memory_to_v4l2(const enum mediabufs_memory m)
++{
++ return (enum v4l2_memory)m;
++}
++
++const char *
++mediabufs_memory_name(const enum mediabufs_memory m)
++{
++ switch (m) {
++ case MEDIABUFS_MEMORY_UNSET:
++ return "Unset";
++ case MEDIABUFS_MEMORY_MMAP:
++ return "MMap";
++ case MEDIABUFS_MEMORY_USERPTR:
++ return "UserPtr";
++ case MEDIABUFS_MEMORY_OVERLAY:
++ return "Overlay";
++ case MEDIABUFS_MEMORY_DMABUF:
++ return "DMABuf";
++ default:
++ break;
++ }
++ return "Unknown";
++}
++
++
++static inline int do_trywait(sem_t *const sem)
++{
++ while (sem_trywait(sem)) {
++ if (errno != EINTR)
++ return -errno;
++ }
++ return 0;
++}
++
++static inline int do_wait(sem_t *const sem)
++{
++ while (sem_wait(sem)) {
++ if (errno != EINTR)
++ return -errno;
++ }
++ return 0;
++}
++
++static int request_buffers(int video_fd, unsigned int type,
++ enum mediabufs_memory memory, unsigned int buffers_count)
++{
++ struct v4l2_requestbuffers buffers;
++ int rc;
++
++ memset(&buffers, 0, sizeof(buffers));
++ buffers.type = type;
++ buffers.memory = mediabufs_memory_to_v4l2(memory);
++ buffers.count = buffers_count;
++
++ rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers);
++ if (rc < 0) {
++ rc = -errno;
++ request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(-rc));
++ return rc;
++ }
++
++ return 0;
++}
++
++
++static int set_stream(int video_fd, unsigned int type, bool enable)
++{
++ enum v4l2_buf_type buf_type = type;
++ int rc;
++
++ rc = ioctl(video_fd, enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF,
++ &buf_type);
++ if (rc < 0) {
++ rc = -errno;
++ request_log("Unable to %sable stream: %s\n",
++ enable ? "en" : "dis", strerror(-rc));
++ return rc;
++ }
++
++ return 0;
++}
++
++
++
++struct media_request * media_request_get(struct media_pool * const mp)
++{
++ struct media_request *req = NULL;
++
++ /* Timeout handled by poll code */
++ if (do_wait(&mp->sem))
++ return NULL;
++
++ pthread_mutex_lock(&mp->lock);
++ req = mp->free_reqs;
++ if (req) {
++ mp->free_reqs = req->next;
++ req->next = NULL;
++ }
++ pthread_mutex_unlock(&mp->lock);
++ return req;
++}
++
++int media_request_fd(const struct media_request * const req)
++{
++ return req->fd;
++}
++
++int media_request_start(struct media_request * const req)
++{
++ while (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) == -1)
++ {
++ const int err = errno;
++ if (err == EINTR)
++ continue;
++ request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
++ return -err;
++ }
++
++ pollqueue_add_task(req->pt, 2000);
++ return 0;
++}
++
++static void media_request_done(void *v, short revents)
++{
++ struct media_request *const req = v;
++ struct media_pool *const mp = req->mp;
++
++ /* ** Not sure what to do about timeout */
++
++ if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0)
++ request_log("Unable to reinit media request: %s\n",
++ strerror(errno));
++
++ pthread_mutex_lock(&mp->lock);
++ req->next = mp->free_reqs;
++ mp->free_reqs = req;
++ pthread_mutex_unlock(&mp->lock);
++ sem_post(&mp->sem);
++}
++
++int media_request_abort(struct media_request ** const preq)
++{
++ struct media_request * const req = *preq;
++
++ if (req == NULL)
++ return 0;
++ *preq = NULL;
++
++ media_request_done(req, 0);
++ return 0;
++}
++
++static void delete_req_chain(struct media_request * const chain)
++{
++ struct media_request * next = chain;
++ while (next) {
++ struct media_request * const req = next;
++ next = req->next;
++ if (req->pt)
++ polltask_delete(&req->pt);
++ if (req->fd != -1)
++ close(req->fd);
++ free(req);
++ }
++}
++
++struct media_pool * media_pool_new(const char * const media_path,
++ struct pollqueue * const pq,
++ const unsigned int n)
++{
++ struct media_pool * const mp = calloc(1, sizeof(*mp));
++ unsigned int i;
++
++ if (!mp)
++ goto fail0;
++
++ mp->pq = pq;
++ pthread_mutex_init(&mp->lock, NULL);
++ mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
++ if (mp->fd == -1) {
++ request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
++ goto fail1;
++ }
++
++ for (i = 0; i != n; ++i) {
++ struct media_request * req = malloc(sizeof(*req));
++ if (!req)
++ goto fail4;
++
++ *req = (struct media_request){
++ .next = mp->free_reqs,
++ .mp = mp,
++ .fd = -1
++ };
++ mp->free_reqs = req;
++
++ if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
++ request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
++ goto fail4;
++ }
++
++ req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
++ if (!req->pt)
++ goto fail4;
++ }
++
++ sem_init(&mp->sem, 0, n);
++
++ return mp;
++
++fail4:
++ delete_req_chain(mp->free_reqs);
++ close(mp->fd);
++ pthread_mutex_destroy(&mp->lock);
++fail1:
++ free(mp);
++fail0:
++ return NULL;
++}
++
++void media_pool_delete(struct media_pool ** pMp)
++{
++ struct media_pool * const mp = *pMp;
++
++ if (!mp)
++ return;
++ *pMp = NULL;
++
++ delete_req_chain(mp->free_reqs);
++ close(mp->fd);
++ sem_destroy(&mp->sem);
++ pthread_mutex_destroy(&mp->lock);
++ free(mp);
++}
++
++
++#define INDEX_UNSET (~(uint32_t)0)
++
++enum qent_status {
++ QENT_NEW = 0, // Initial state - shouldn't last
++ QENT_FREE, // On free chain
++ QENT_PENDING, // User has ent
++ QENT_WAITING, // On inuse
++ QENT_DONE, // Frame rx
++ QENT_ERROR, // Error
++ QENT_IMPORT
++};
++
++struct qent_base {
++ atomic_int ref_count;
++ struct qent_base *next;
++ struct qent_base *prev;
++ enum qent_status status;
++ enum mediabufs_memory memtype;
++ uint32_t index;
++ struct dmabuf_h *dh[VIDEO_MAX_PLANES];
++ struct timeval timestamp;
++};
++
++struct qent_src {
++ struct qent_base base;
++ int fixed_size;
++};
++
++struct qent_dst {
++ struct qent_base base;
++ bool waiting;
++ pthread_mutex_t lock;
++ pthread_cond_t cond;
++ struct ff_weak_link_client * mbc_wl;
++};
++
++struct qe_list_head {
++ struct qent_base *head;
++ struct qent_base *tail;
++};
++
++struct buf_pool {
++ enum mediabufs_memory memtype;
++ pthread_mutex_t lock;
++ sem_t free_sem;
++ struct qe_list_head free;
++ struct qe_list_head inuse;
++};
++
++
++static inline struct qent_dst *base_to_dst(struct qent_base *be)
++{
++ return (struct qent_dst *)be;
++}
++
++static inline struct qent_src *base_to_src(struct qent_base *be)
++{
++ return (struct qent_src *)be;
++}
++
++
++#define QENT_BASE_INITIALIZER(mtype) {\
++ .ref_count = ATOMIC_VAR_INIT(0),\
++ .status = QENT_NEW,\
++ .memtype = (mtype),\
++ .index = INDEX_UNSET\
++}
++
++static void qe_base_uninit(struct qent_base *const be)
++{
++ unsigned int i;
++ for (i = 0; i != VIDEO_MAX_PLANES; ++i) {
++ dmabuf_free(be->dh[i]);
++ be->dh[i] = NULL;
++ }
++}
++
++static void qe_src_free(struct qent_src *const be_src)
++{
++ if (!be_src)
++ return;
++ qe_base_uninit(&be_src->base);
++ free(be_src);
++}
++
++static struct qent_src * qe_src_new(enum mediabufs_memory mtype)
++{
++ struct qent_src *const be_src = malloc(sizeof(*be_src));
++ if (!be_src)
++ return NULL;
++ *be_src = (struct qent_src){
++ .base = QENT_BASE_INITIALIZER(mtype)
++ };
++ return be_src;
++}
++
++static void qe_dst_free(struct qent_dst *const be_dst)
++{
++ if (!be_dst)
++ return;
++
++ ff_weak_link_unref(&be_dst->mbc_wl);
++ pthread_cond_destroy(&be_dst->cond);
++ pthread_mutex_destroy(&be_dst->lock);
++ qe_base_uninit(&be_dst->base);
++ free(be_dst);
++}
++
++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory memtype)
++{
++ struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
++ if (!be_dst)
++ return NULL;
++ *be_dst = (struct qent_dst){
++ .base = QENT_BASE_INITIALIZER(memtype),
++ .lock = PTHREAD_MUTEX_INITIALIZER,
++ .cond = PTHREAD_COND_INITIALIZER,
++ .mbc_wl = ff_weak_link_ref(wl)
++ };
++ return be_dst;
++}
++
++static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
++{
++ if (ql->tail)
++ ql->tail->next = be;
++ else
++ ql->head = be;
++ be->prev = ql->tail;
++ be->next = NULL;
++ ql->tail = be;
++}
++
++static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
++{
++ if (!be)
++ return NULL;
++
++ if (be->next)
++ be->next->prev = be->prev;
++ else
++ ql->tail = be->prev;
++ if (be->prev)
++ be->prev->next = be->next;
++ else
++ ql->head = be->next;
++ be->next = NULL;
++ be->prev = NULL;
++ return be;
++}
++
++
++static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
++{
++ ql_add_tail(&bp->free, be);
++}
++
++static struct qent_base * bq_get_free(struct buf_pool *const bp)
++{
++ return ql_extract(&bp->free, bp->free.head);
++}
++
++static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
++{
++ return ql_extract(&bp->inuse, be);
++}
++
++static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
++{
++ return ql_extract(&bp->inuse, bp->inuse.head);
++}
++
++static void bq_free_all_free_src(struct buf_pool *const bp)
++{
++ struct qent_base *be;
++ while ((be = bq_get_free(bp)) != NULL)
++ qe_src_free(base_to_src(be));
++}
++
++static void bq_free_all_inuse_src(struct buf_pool *const bp)
++{
++ struct qent_base *be;
++ while ((be = bq_get_inuse(bp)) != NULL)
++ qe_src_free(base_to_src(be));
++}
++
++static void bq_free_all_free_dst(struct buf_pool *const bp)
++{
++ struct qent_base *be;
++ while ((be = bq_get_free(bp)) != NULL)
++ qe_dst_free(base_to_dst(be));
++}
++
++static void queue_put_free(struct buf_pool *const bp, struct qent_base *be)
++{
++ unsigned int i;
++
++ pthread_mutex_lock(&bp->lock);
++ /* Clear out state vars */
++ be->timestamp.tv_sec = 0;
++ be->timestamp.tv_usec = 0;
++ be->status = QENT_FREE;
++ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i)
++ dmabuf_len_set(be->dh[i], 0);
++ bq_put_free(bp, be);
++ pthread_mutex_unlock(&bp->lock);
++ sem_post(&bp->free_sem);
++}
++
++static bool queue_is_inuse(const struct buf_pool *const bp)
++{
++ return bp->inuse.tail != NULL;
++}
++
++static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
++{
++ if (!be)
++ return;
++ pthread_mutex_lock(&bp->lock);
++ ql_add_tail(&bp->inuse, be);
++ be->status = QENT_WAITING;
++ pthread_mutex_unlock(&bp->lock);
++}
++
++static struct qent_base *queue_get_free(struct buf_pool *const bp)
++{
++ struct qent_base *buf;
++
++ if (do_wait(&bp->free_sem))
++ return NULL;
++ pthread_mutex_lock(&bp->lock);
++ buf = bq_get_free(bp);
++ pthread_mutex_unlock(&bp->lock);
++ return buf;
++}
++
++static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
++{
++ struct qent_base *buf;
++
++ if (do_trywait(&bp->free_sem))
++ return NULL;
++ pthread_mutex_lock(&bp->lock);
++ buf = bq_get_free(bp);
++ pthread_mutex_unlock(&bp->lock);
++ return buf;
++}
++
++static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index)
++{
++ struct qent_base *be;
++
++ pthread_mutex_lock(&bp->lock);
++ /* Expect 1st in Q, but allow anywhere */
++ for (be = bp->inuse.head; be; be = be->next) {
++ if (be->index == index) {
++ bq_extract_inuse(bp, be);
++ break;
++ }
++ }
++ pthread_mutex_unlock(&bp->lock);
++
++ return be;
++}
++
++static void queue_delete(struct buf_pool *const bp)
++{
++ sem_destroy(&bp->free_sem);
++ pthread_mutex_destroy(&bp->lock);
++ free(bp);
++}
++
++static struct buf_pool* queue_new(const int vfd)
++{
++ struct buf_pool *bp = calloc(1, sizeof(*bp));
++ if (!bp)
++ return NULL;
++ pthread_mutex_init(&bp->lock, NULL);
++ sem_init(&bp->free_sem, 0, 0);
++ return bp;
++}
++
++
++struct mediabufs_ctl {
++ atomic_int ref_count; /* 0 is single ref for easier atomics */
++ void * dc;
++ int vfd;
++ bool stream_on;
++ bool polling;
++ bool dst_fixed; // Dst Q is fixed size
++ pthread_mutex_t lock;
++ struct buf_pool * src;
++ struct buf_pool * dst;
++ struct polltask * pt;
++ struct pollqueue * pq;
++ struct ff_weak_link_master * this_wlm;
++
++ enum mediabufs_memory src_memtype;
++ enum mediabufs_memory dst_memtype;
++ struct v4l2_format src_fmt;
++ struct v4l2_format dst_fmt;
++ struct v4l2_capability capability;
++};
++
++static int qe_v4l2_queue(struct qent_base *const be,
++ const int vfd, struct media_request *const mreq,
++ const struct v4l2_format *const fmt,
++ const bool is_dst, const bool hold_flag)
++{
++ struct v4l2_buffer buffer = {
++ .type = fmt->type,
++ .memory = mediabufs_memory_to_v4l2(be->memtype),
++ .index = be->index
++ };
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ unsigned int i;
++ for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++ if (is_dst)
++ dmabuf_len_set(be->dh[i], 0);
++
++ /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
++ planes[i].length = dmabuf_size(be->dh[i]);
++ planes[i].bytesused = dmabuf_len(be->dh[i]);
++ if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
++ planes[i].m.fd = dmabuf_fd(be->dh[i]);
++ else
++ planes[i].m.mem_offset = 0;
++ }
++ buffer.m.planes = planes;
++ buffer.length = i;
++ }
++ else {
++ if (is_dst)
++ dmabuf_len_set(be->dh[0], 0);
++
++ buffer.bytesused = dmabuf_len(be->dh[0]);
++ buffer.length = dmabuf_size(be->dh[0]);
++ if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
++ buffer.m.fd = dmabuf_fd(be->dh[0]);
++ else
++ buffer.m.offset = 0;
++ }
++
++ if (!is_dst && mreq) {
++ buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
++ buffer.request_fd = media_request_fd(mreq);
++ if (hold_flag)
++ buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
++ }
++
++ if (is_dst)
++ be->timestamp = (struct timeval){0,0};
++
++ buffer.timestamp = be->timestamp;
++
++ while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {
++ const int err = errno;
++ if (err != EINTR) {
++ request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
++ return -err;
++ }
++ }
++ return 0;
++}
++
++static struct qent_base * qe_dequeue(struct buf_pool *const bp,
++ const int vfd,
++ const struct v4l2_format * const f)
++{
++ struct qent_base *be;
++ int rc;
++ const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
++ struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++ struct v4l2_buffer buffer = {
++ .type = f->type,
++ .memory = mediabufs_memory_to_v4l2(bp->memtype)
++ };
++ if (mp) {
++ buffer.length = f->fmt.pix_mp.num_planes;
++ buffer.m.planes = planes;
++ }
++
++ while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
++ errno == EINTR)
++ /* Loop */;
++ if (rc) {
++ request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
++ return NULL;
++ }
++
++ be = queue_find_extract_index(bp, buffer.index);
++ if (!be) {
++ request_log("Failed to find index %d in Q\n", buffer.index);
++ return NULL;
++ }
++
++ if (mp) {
++ unsigned int i;
++ for (i = 0; i != buffer.length; ++i)
++ dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0);
++ }
++ else
++ dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.length : 0);
++
++ be->timestamp = buffer.timestamp;
++ be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
++ return be;
++}
++
++static void qe_dst_done(struct qent_dst * dst_be)
++{
++ pthread_mutex_lock(&dst_be->lock);
++ dst_be->waiting = false;
++ pthread_cond_broadcast(&dst_be->cond);
++ pthread_mutex_unlock(&dst_be->lock);
++
++ qent_dst_unref(&dst_be);
++}
++
++static bool qe_dst_waiting(struct qent_dst *const dst_be)
++{
++ bool waiting;
++ pthread_mutex_lock(&dst_be->lock);
++ waiting = dst_be->waiting;
++ dst_be->waiting = true;
++ pthread_mutex_unlock(&dst_be->lock);
++ return waiting;
++}
++
++
++static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
++{
++ return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst);
++}
++
++static void mediabufs_poll_cb(void * v, short revents)
++{
++ struct mediabufs_ctl *mbc = v;
++ struct qent_src *src_be = NULL;
++ struct qent_dst *dst_be = NULL;
++
++ if (!revents)
++ request_err(mbc->dc, "%s: Timeout\n", __func__);
++
++ pthread_mutex_lock(&mbc->lock);
++ mbc->polling = false;
++
++ if ((revents & POLLOUT) != 0)
++ src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
++ if ((revents & POLLIN) != 0)
++ dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
++
++ /* Reschedule */
++ if (mediabufs_wants_poll(mbc)) {
++ mbc->polling = true;
++ pollqueue_add_task(mbc->pt, 2000);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++
++ if (src_be)
++ queue_put_free(mbc->src, &src_be->base);
++ if (dst_be)
++ qe_dst_done(dst_be);
++}
++
++int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
++{
++ struct qent_base *const be = &be_src->base;
++
++ be->timestamp = *timestamp;
++ return 0;
++}
++
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
++{
++ return be_dst->base.timestamp;
++}
++
++static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
++{
++ if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
++ size_t newsize = round_up_size(len);
++ request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
++ if (!dbsc) {
++ request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
++ return -ENOMEM;
++ }
++ if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {
++ request_log("%s: Realloc %zd failed\n", __func__, newsize);
++ return -ENOMEM;
++ }
++ }
++ return 0;
++}
++
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++ struct qent_base *const be = &be_src->base;
++ return qent_base_realloc(be, len, dbsc);
++}
++
++
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++ void * dst;
++ struct qent_base *const be = &be_src->base;
++ int rv;
++
++ // Realloc doesn't copy so don't alloc if offset != 0
++ if ((rv = qent_base_realloc(be, offset + len,
++ be_src->fixed_size || offset ? NULL : dbsc)) != 0)
++ return rv;
++
++ dmabuf_write_start(be->dh[0]);
++ dst = dmabuf_map(be->dh[0]);
++ if (!dst)
++ return -1;
++ memcpy((char*)dst + offset, src, len);
++ dmabuf_len_set(be->dh[0], len);
++ dmabuf_write_end(be->dh[0]);
++ return 0;
++}
++
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
++{
++ const struct qent_base *const be = &be_dst->base;
++
++ return (plane >= sizeof(be->dh)/sizeof(be->dh[0])) ? NULL : be->dh[plane];
++}
++
++int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
++{
++ return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane)));
++}
++
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++ struct media_request **const pmreq,
++ struct qent_src **const psrc_be,
++ struct qent_dst *const dst_be,
++ const bool is_final)
++{
++ struct media_request * mreq = *pmreq;
++ struct qent_src *const src_be = *psrc_be;
++
++ // Req & src are always both "consumed"
++ *pmreq = NULL;
++ *psrc_be = NULL;
++
++ pthread_mutex_lock(&mbc->lock);
++
++ if (!src_be)
++ goto fail1;
++
++ if (dst_be) {
++ if (qe_dst_waiting(dst_be)) {
++ request_info(mbc->dc, "Request buffer already waiting on start\n");
++ goto fail1;
++ }
++ dst_be->base.timestamp = (struct timeval){0,0};
++ if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
++ goto fail1;
++
++ qent_dst_ref(dst_be);
++ queue_put_inuse(mbc->dst, &dst_be->base);
++ }
++
++ if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
++ goto fail1;
++ queue_put_inuse(mbc->src, &src_be->base);
++
++ if (!mbc->polling && mediabufs_wants_poll(mbc)) {
++ mbc->polling = true;
++ pollqueue_add_task(mbc->pt, 2000);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++
++ if (media_request_start(mreq))
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail1:
++ media_request_abort(&mreq);
++ if (src_be)
++ queue_put_free(mbc->src, &src_be->base);
++
++// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q
++ if (dst_be) {
++ dst_be->base.status = QENT_ERROR;
++ qe_dst_done(dst_be);
++ }
++ pthread_mutex_unlock(&mbc->lock);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++
++static int qe_alloc_from_fmt(struct qent_base *const be,
++ struct dmabufs_ctl *const dbsc,
++ const struct v4l2_format *const fmt)
++{
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ unsigned int i;
++ for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) {
++ be->dh[i] = dmabuf_realloc(dbsc, be->dh[i],
++ fmt->fmt.pix_mp.plane_fmt[i].sizeimage);
++ /* On failure tidy up and die */
++ if (!be->dh[i]) {
++ while (i--) {
++ dmabuf_free(be->dh[i]);
++ be->dh[i] = NULL;
++ }
++ return -1;
++ }
++ }
++ }
++ else {
++// be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage);
++ size_t size = fmt->fmt.pix.sizeimage;
++ be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
++ if (!be->dh[0])
++ return -1;
++ }
++ return 0;
++}
++
++static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
++ const enum v4l2_buf_type buftype,
++ uint32_t pixfmt,
++ const unsigned int width, const unsigned int height,
++ const size_t bufsize)
++{
++ *fmt = (struct v4l2_format){.type = buftype};
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++ fmt->fmt.pix_mp.width = width;
++ fmt->fmt.pix_mp.height = height;
++ fmt->fmt.pix_mp.pixelformat = pixfmt;
++ if (bufsize) {
++ fmt->fmt.pix_mp.num_planes = 1;
++ fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
++ }
++ }
++ else {
++ fmt->fmt.pix.width = width;
++ fmt->fmt.pix.height = height;
++ fmt->fmt.pix.pixelformat = pixfmt;
++ fmt->fmt.pix.sizeimage = bufsize;
++ }
++
++ while (ioctl(fd, VIDIOC_S_FMT, fmt))
++ if (errno != EINTR)
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ // Treat anything where we don't get at least what we asked for as a fail
++ if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++ if (fmt->fmt.pix_mp.width < width ||
++ fmt->fmt.pix_mp.height < height ||
++ fmt->fmt.pix_mp.pixelformat != pixfmt) {
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ }
++ else {
++ if (fmt->fmt.pix.width < width ||
++ fmt->fmt.pix.height < height ||
++ fmt->fmt.pix.pixelformat != pixfmt) {
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ }
++
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++
++static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
++ const int fd,
++ const unsigned int type_v4l2,
++ const uint32_t flags_must,
++ const uint32_t flags_not,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v)
++{
++ unsigned int i;
++
++ for (i = 0;; ++i) {
++ struct v4l2_fmtdesc fmtdesc = {
++ .index = i,
++ .type = type_v4l2
++ };
++ while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++ if (errno != EINTR)
++ return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++ }
++ if ((fmtdesc.flags & flags_must) != flags_must ||
++ (fmtdesc.flags & flags_not))
++ continue;
++ if (!accept_fn(accept_v, &fmtdesc))
++ continue;
++
++ if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat,
++ width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
++ return MEDIABUFS_STATUS_SUCCESS;
++ }
++ return 0;
++}
++
++
++/* Wait for qent done */
++
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ enum qent_status estat;
++
++ pthread_mutex_lock(&be_dst->lock);
++ while (be_dst->waiting &&
++ !pthread_cond_wait(&be_dst->cond, &be_dst->lock))
++ /* Loop */;
++ estat = be->status;
++ pthread_mutex_unlock(&be_dst->lock);
++
++ return estat == QENT_DONE ? MEDIABUFS_STATUS_SUCCESS :
++ estat == QENT_ERROR ? MEDIABUFS_ERROR_DECODING_ERROR :
++ MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
++{
++ struct qent_base *const be = &be_dst->base;
++ return dmabuf_map(be->dh[buf_no]);
++}
++
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ unsigned int i;
++ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++ if (dmabuf_read_start(be->dh[i])) {
++ while (i--)
++ dmabuf_read_end(be->dh[i]);
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++ }
++ }
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
++{
++ struct qent_base *const be = &be_dst->base;
++ unsigned int i;
++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++
++ for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++ if (dmabuf_read_end(be->dh[i]))
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++ return status;
++}
++
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
++{
++ if (be_dst)
++ atomic_fetch_add(&be_dst->base.ref_count, 1);
++ return be_dst;
++}
++
++void qent_dst_unref(struct qent_dst ** const pbe_dst)
++{
++ struct qent_dst * const be_dst = *pbe_dst;
++ struct mediabufs_ctl * mbc;
++ if (!be_dst)
++ return;
++ *pbe_dst = NULL;
++
++ if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0)
++ return;
++
++ if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
++ queue_put_free(mbc->dst, &be_dst->base);
++ ff_weak_link_unlock(be_dst->mbc_wl);
++ }
++ else {
++ qe_dst_free(be_dst);
++ }
++}
++
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++ unsigned int plane,
++ int fd, size_t size)
++{
++ struct qent_base *const be = &be_dst->base;
++ struct dmabuf_h * dh;
++
++ if (be->status != QENT_IMPORT || be->dh[plane])
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ dh = dmabuf_import(fd, size);
++ if (!dh)
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++
++ be->dh[plane] = dh;
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++
++// Returns noof buffers created, -ve for error
++static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
++{
++ unsigned int i;
++
++ struct v4l2_create_buffers cbuf = {
++ .count = n,
++ .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype),
++ .format = mbc->dst_fmt,
++ };
++
++ while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
++ const int err = -errno;
++ if (err != EINTR) {
++ request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
++ return -err;
++ }
++ }
++
++ if (cbuf.count != n)
++ request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
++
++ for (i = 0; i != cbuf.count; ++i)
++ qes[i]->base.index = cbuf.index + i;
++
++ return cbuf.count;
++}
++
++static MediaBufsStatus
++qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt,
++ const unsigned int n, const bool x_dmabuf)
++{
++ struct v4l2_buffer buf = {
++ .index = n,
++ .type = fmt->type,
++ };
++ struct v4l2_plane planes[VIDEO_MAX_PLANES];
++ int ret;
++
++ if (be->dh[0])
++ return 0;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ memset(planes, 0, sizeof(planes));
++ buf.m.planes = planes;
++ buf.length = VIDEO_MAX_PLANES;
++ }
++
++ if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) {
++ request_err(mbc->dc, "VIDIOC_QUERYBUF failed");
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type))
++ {
++ unsigned int i;
++ for (i = 0; i != buf.length; ++i) {
++ if (x_dmabuf) {
++ struct v4l2_exportbuffer xbuf = {
++ .type = buf.type,
++ .index = buf.index,
++ .plane = i,
++ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
++ };
++ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
++ be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length);
++ }
++ else {
++ be->dh[i] = dmabuf_import_mmap(
++ mmap(NULL, planes[i].length,
++ PROT_READ | PROT_WRITE,
++ MAP_SHARED | MAP_POPULATE,
++ mbc->vfd, planes[i].m.mem_offset),
++ planes[i].length);
++ }
++ /* On failure tidy up and die */
++ if (!be->dh[i]) {
++ while (i--) {
++ dmabuf_free(be->dh[i]);
++ be->dh[i] = NULL;
++ }
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++ }
++ }
++ else
++ {
++ if (x_dmabuf) {
++ struct v4l2_exportbuffer xbuf = {
++ .type = buf.type,
++ .index = buf.index,
++ .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
++ };
++ if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
++ be->dh[0] = dmabuf_import(xbuf.fd, buf.length);
++ }
++ else {
++ be->dh[0] = dmabuf_import_mmap(
++ mmap(NULL, buf.length,
++ PROT_READ | PROT_WRITE,
++ MAP_SHARED | MAP_POPULATE,
++ mbc->vfd, buf.m.offset),
++ buf.length);
++ }
++ /* On failure tidy up and die */
++ if (!be->dh[0]) {
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++ }
++
++ return 0;
++}
++
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
++{
++ struct qent_dst * be_dst;
++
++ if (mbc == NULL) {
++ be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF);
++ if (be_dst)
++ be_dst->base.status = QENT_IMPORT;
++ return be_dst;
++ }
++
++ if (mbc->dst_fixed) {
++ be_dst = base_to_dst(queue_get_free(mbc->dst));
++ if (!be_dst)
++ return NULL;
++ }
++ else {
++ be_dst = base_to_dst(queue_tryget_free(mbc->dst));
++ if (!be_dst) {
++ be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype);
++ if (!be_dst)
++ return NULL;
++
++ if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
++ qe_dst_free(be_dst);
++ return NULL;
++ }
++ }
++ }
++
++ if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) {
++ if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) {
++ request_err(mbc->dc, "Failed to export as dmabuf\n");
++ queue_put_free(mbc->dst, &be_dst->base);
++ return NULL;
++ }
++ }
++ else {
++ if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
++ /* Given how create buf works we can't uncreate it on alloc failure
++ * all we can do is put it on the free Q
++ */
++ queue_put_free(mbc->dst, &be_dst->base);
++ return NULL;
++ }
++ }
++
++ be_dst->base.status = QENT_PENDING;
++ atomic_store(&be_dst->base.ref_count, 0);
++ return be_dst;
++}
++
++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
++{
++ return &mbc->dst_fmt;
++}
++
++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v)
++{
++ MediaBufsStatus status;
++ unsigned int i;
++ const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
++ static const struct {
++ unsigned int flags_must;
++ unsigned int flags_not;
++ } trys[] = {
++ {0, V4L2_FMT_FLAG_EMULATED},
++ {V4L2_FMT_FLAG_EMULATED, 0},
++ };
++ for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) {
++ status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
++ buf_type,
++ trys[i].flags_must,
++ trys[i].flags_not,
++ width, height, accept_fn, accept_v);
++ if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE)
++ return status;
++ }
++
++ if (status != MEDIABUFS_STATUS_SUCCESS)
++ return status;
++
++ /* Try to create a buffer - don't alloc */
++ return status;
++}
++
++// ** This is a mess if we get partial alloc but without any way to remove
++// individual V4L2 Q members we are somewhat stuffed
++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype)
++{
++ unsigned int i;
++ int a = 0;
++ unsigned int qc;
++ struct qent_dst * qes[32];
++
++ if (n > 32)
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++
++ mbc->dst->memtype = memtype;
++
++ // Create qents first as it is hard to get rid of the V4L2 buffers on error
++ for (qc = 0; qc != n; ++qc)
++ {
++ if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL)
++ goto fail;
++ }
++
++ if ((a = create_dst_bufs(mbc, n, qes)) < 0)
++ goto fail;
++
++ for (i = 0; i != a; ++i)
++ queue_put_free(mbc->dst, &qes[i]->base);
++
++ if (a != n)
++ goto fail;
++
++ mbc->dst_fixed = fixed;
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++ for (i = (a < 0 ? 0 : a); i != qc; ++i)
++ qe_dst_free(qes[i]);
++
++ return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++}
++
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
++{
++ struct qent_base * buf = queue_get_free(mbc->src);
++ buf->status = QENT_PENDING;
++ return base_to_src(buf);
++}
++
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
++{
++ struct qent_src *const qe_src = *pqe_src;
++ if (!qe_src)
++ return;
++ *pqe_src = NULL;
++ queue_put_free(mbc->src, &qe_src->base);
++}
++
++static MediaBufsStatus
++chk_memory_type(struct mediabufs_ctl *const mbc,
++ const struct v4l2_format * const f,
++ const enum mediabufs_memory m)
++{
++ struct v4l2_create_buffers cbuf = {
++ .count = 0,
++ .memory = V4L2_MEMORY_MMAP,
++ .format = *f
++ };
++
++ if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0)
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++ switch (m) {
++ case MEDIABUFS_MEMORY_DMABUF:
++ // 0 = Unknown but assume not in that case
++ if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0)
++ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
++ break;
++ case MEDIABUFS_MEMORY_MMAP:
++ break;
++ default:
++ return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
++ }
++
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++
++MediaBufsStatus
++mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
++{
++ return chk_memory_type(mbc, &mbc->src_fmt, memtype);
++}
++
++MediaBufsStatus
++mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
++{
++ return chk_memory_type(mbc, &mbc->dst_fmt, memtype);
++}
++
++/* src format must have been set up before this */
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
++ struct dmabufs_ctl * const dbsc,
++ unsigned int n, const enum mediabufs_memory memtype)
++{
++ unsigned int i;
++ struct v4l2_requestbuffers req = {
++ .count = n,
++ .type = mbc->src_fmt.type,
++ .memory = mediabufs_memory_to_v4l2(memtype)
++ };
++
++ bq_free_all_free_src(mbc->src);
++
++ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
++ if (errno != EINTR) {
++ request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++ }
++
++ if (n > req.count) {
++ request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
++ n = req.count;
++ }
++
++ for (i = 0; i != n; ++i) {
++ struct qent_src *const be_src = qe_src_new(memtype);
++ if (!be_src) {
++ request_err(mbc->dc, "Failed to create src be %d\n", i);
++ goto fail;
++ }
++ switch (memtype) {
++ case MEDIABUFS_MEMORY_MMAP:
++ if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) {
++ qe_src_free(be_src);
++ goto fail;
++ }
++ be_src->fixed_size = 1;
++ break;
++ case MEDIABUFS_MEMORY_DMABUF:
++ if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
++ qe_src_free(be_src);
++ goto fail;
++ }
++ be_src->fixed_size = !mediabufs_src_resizable(mbc);
++ break;
++ default:
++ request_err(mbc->dc, "Unexpected memory type\n");
++ goto fail;
++ }
++ be_src->base.index = i;
++
++ queue_put_free(mbc->src, &be_src->base);
++ }
++
++ mbc->src->memtype = memtype;
++ return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++ bq_free_all_free_src(mbc->src);
++ req.count = 0;
++ while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
++ errno == EINTR)
++ /* Loop */;
++
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++
++
++/*
++ * Set stuff order:
++ * Set src fmt
++ * Set parameters (sps) on vfd
++ * Negotiate dst format (dst_fmt_set)
++ * Create src buffers
++ * Alloc a dst buffer or Create dst slots
++*/
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
++{
++ if (mbc->stream_on)
++ return MEDIABUFS_STATUS_SUCCESS;
++
++ if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
++ request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
++ request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
++ set_stream(mbc->vfd, mbc->src_fmt.type, false);
++ return MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ mbc->stream_on = true;
++ return MEDIABUFS_STATUS_SUCCESS;
++}
++
++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
++{
++ MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++
++ if (!mbc->stream_on)
++ return MEDIABUFS_STATUS_SUCCESS;
++
++ if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
++ request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
++ request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
++ status = MEDIABUFS_ERROR_OPERATION_FAILED;
++ }
++
++ mbc->stream_on = false;
++ return status;
++}
++
++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
++{
++ struct v4l2_ext_controls controls = {
++ .controls = control_array,
++ .count = n
++ };
++
++ if (mreq) {
++ controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
++ controls.request_fd = media_request_fd(mreq);
++ }
++
++ while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls))
++ {
++ const int err = errno;
++ if (err != EINTR) {
++ request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
++ return -err;
++ }
++ }
++
++ return 0;
++}
++
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
++ struct media_request * const mreq,
++ unsigned int id, void *data,
++ unsigned int size)
++{
++ struct v4l2_ext_control control = {
++ .id = id,
++ .ptr = data,
++ .size = size
++ };
++
++ int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1);
++ return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
++ enum v4l2_buf_type buf_type,
++ const uint32_t pixfmt,
++ const uint32_t width, const uint32_t height,
++ const size_t bufsize)
++{
++ MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
++ if (rv != MEDIABUFS_STATUS_SUCCESS)
++ request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
++
++ return rv;
++}
++
++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
++{
++ int rv = 0;
++ while (n--) {
++ while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) {
++ const int err = errno;
++ if (err != EINTR) {
++ // Often used for probing - errors are to be expected
++ request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err);
++ ctrls->type = 0; // 0 is invalid
++ rv = -err;
++ break;
++ }
++ }
++ ++ctrls;
++ }
++ return rv;
++}
++
++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
++{
++#if 1
++ return 0;
++#else
++ // Single planar OUTPUT can only take exact size buffers
++ // Multiplanar will take larger than negotiated
++ return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
++#endif
++}
++
++static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
++{
++ if (!mbc)
++ return;
++
++ // Break the weak link first
++ ff_weak_link_break(&mbc->this_wlm);
++
++ polltask_delete(&mbc->pt);
++
++ mediabufs_stream_off(mbc);
++
++ // Empty v4l2 buffer stash
++ request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
++ request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
++
++ bq_free_all_free_src(mbc->src);
++ bq_free_all_inuse_src(mbc->src);
++ bq_free_all_free_dst(mbc->dst);
++
++ {
++ struct qent_dst *dst_be;
++ while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {
++ dst_be->base.timestamp = (struct timeval){0};
++ dst_be->base.status = QENT_ERROR;
++ qe_dst_done(dst_be);
++ }
++ }
++
++ queue_delete(mbc->dst);
++ queue_delete(mbc->src);
++ close(mbc->vfd);
++ pthread_mutex_destroy(&mbc->lock);
++
++ free(mbc);
++}
++
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
++{
++ atomic_fetch_add(&mbc->ref_count, 1);
++ return mbc;
++}
++
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
++{
++ struct mediabufs_ctl *const mbc = *pmbc;
++ int n;
++
++ if (!mbc)
++ return;
++ *pmbc = NULL;
++ n = atomic_fetch_sub(&mbc->ref_count, 1);
++ if (n)
++ return;
++ mediabufs_ctl_delete(mbc);
++}
++
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
++{
++ return mbc->capability.version;
++}
++
++static int set_capabilities(struct mediabufs_ctl *const mbc)
++{
++ uint32_t caps;
++
++ if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
++ int err = errno;
++ request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
++ return -err;
++ }
++
++ caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
++ mbc->capability.device_caps :
++ mbc->capability.capabilities;
++
++ if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
++ }
++ else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) {
++ mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
++ mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
++ }
++ else {
++ request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps);
++ return -EINVAL;
++ }
++
++ return 0;
++}
++
++/* One of these per context */
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
++{
++ struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
++
++ if (!mbc)
++ return NULL;
++
++ mbc->dc = dc;
++ // Default mono planar
++ mbc->pq = pq;
++ pthread_mutex_init(&mbc->lock, NULL);
++
++ /* Pick a default - could we scan for this? */
++ if (vpath == NULL)
++ vpath = "/dev/media0";
++
++ while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
++ {
++ const int err = errno;
++ if (err != EINTR) {
++ request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
++ goto fail0;
++ }
++ }
++
++ if (set_capabilities(mbc)) {
++ request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
++ goto fail1;
++ }
++
++ mbc->src = queue_new(mbc->vfd);
++ if (!mbc->src)
++ goto fail1;
++ mbc->dst = queue_new(mbc->vfd);
++ if (!mbc->dst)
++ goto fail2;
++ mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
++ if (!mbc->pt)
++ goto fail3;
++ mbc->this_wlm = ff_weak_link_new(mbc);
++ if (!mbc->this_wlm)
++ goto fail4;
++
++ /* Cannot add polltask now - polling with nothing pending
++ * generates infinite error polls
++ */
++ return mbc;
++
++fail4:
++ polltask_delete(&mbc->pt);
++fail3:
++ queue_delete(mbc->dst);
++fail2:
++ queue_delete(mbc->src);
++fail1:
++ close(mbc->vfd);
++fail0:
++ free(mbc);
++ request_info(dc, "%s: FAILED\n", __func__);
++ return NULL;
++}
++
++
++
+diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
+new file mode 100644
+index 0000000000..890947b2e2
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.h
+@@ -0,0 +1,171 @@
++/*
++ * v4l2_req_media.h
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef _MEDIA_H_
++#define _MEDIA_H_
++
++#include <stdbool.h>
++#include <stdint.h>
++
++struct v4l2_format;
++struct v4l2_fmtdesc;
++struct v4l2_query_ext_ctrl;
++
++struct pollqueue;
++struct media_request;
++struct media_pool;
++
++typedef enum media_buf_status {
++ MEDIABUFS_STATUS_SUCCESS = 0,
++ MEDIABUFS_ERROR_OPERATION_FAILED,
++ MEDIABUFS_ERROR_DECODING_ERROR,
++ MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
++ MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
++ MEDIABUFS_ERROR_ALLOCATION_FAILED,
++ MEDIABUFS_ERROR_UNSUPPORTED_MEMORY,
++} MediaBufsStatus;
++
++struct media_pool * media_pool_new(const char * const media_path,
++ struct pollqueue * const pq,
++ const unsigned int n);
++void media_pool_delete(struct media_pool ** pmp);
++
++// Obtain a media request
++// Will block if none available - has a 2sec timeout
++struct media_request * media_request_get(struct media_pool * const mp);
++int media_request_fd(const struct media_request * const req);
++
++// Start this request
++// Request structure is returned to pool once done
++int media_request_start(struct media_request * const req);
++
++// Return an *unstarted* media_request to the pool
++// May later be upgraded to allow for aborting a started req
++int media_request_abort(struct media_request ** const preq);
++
++
++struct mediabufs_ctl;
++struct qent_src;
++struct qent_dst;
++struct dmabuf_h;
++struct dmabufs_ctl;
++
++// 1-1 mapping to V4L2 type - just defined separately to avoid some include versioning difficulties
++enum mediabufs_memory {
++ MEDIABUFS_MEMORY_UNSET = 0,
++ MEDIABUFS_MEMORY_MMAP = 1,
++ MEDIABUFS_MEMORY_USERPTR = 2,
++ MEDIABUFS_MEMORY_OVERLAY = 3,
++ MEDIABUFS_MEMORY_DMABUF = 4,
++};
++
++int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
++
++// prealloc
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
++// dbsc may be NULL if realloc not required
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
++int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
++void qent_dst_delete(struct qent_dst *const be);
++// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
++void qent_dst_unref(struct qent_dst ** const pbe_dst);
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
++
++const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
++/* Import an fd unattached to any mediabuf */
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++ unsigned int plane,
++ int fd, size_t size);
++
++const char * mediabufs_memory_name(const enum mediabufs_memory m);
++
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++ struct media_request **const pmreq,
++ struct qent_src **const psrc_be,
++ struct qent_dst *const dst_be,
++ const bool is_final);
++// Get / alloc a dst buffer & associate with a slot
++// If the dst pool is empty then behaviour depends on the fixed flag passed to
++// dst_slots_create. Default is !fixed = unlimited alloc
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
++ struct dmabufs_ctl *const dbsc);
++// Create dst slots without alloc
++// If fixed true then qent_alloc will only get slots from this pool and will
++// block until a qent has been unrefed
++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype);
++
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
++
++typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
++
++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
++ const unsigned int width,
++ const unsigned int height,
++ mediabufs_dst_fmt_accept_fn *const accept_fn,
++ void *const accept_v);
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
++
++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
++ struct v4l2_ext_control control_array[], unsigned int n);
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
++ struct media_request * const mreq,
++ unsigned int id, void *data,
++ unsigned int size);
++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
++
++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
++
++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
++ enum v4l2_buf_type buf_type,
++ const uint32_t pixfmt,
++ const uint32_t width, const uint32_t height,
++ const size_t bufsize);
++
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
++ struct dmabufs_ctl * const dbsc,
++ unsigned int n,
++ const enum mediabufs_memory memtype);
++
++// Want to have appropriate formats set first
++MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
++MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
++
++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
++
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
++ const char *vpath, struct pollqueue *const pq);
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
++
++
++#endif
+diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c
+new file mode 100644
+index 0000000000..cc8a5d4001
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.c
+@@ -0,0 +1,361 @@
++#include <errno.h>
++#include <limits.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <stdint.h>
++#include <stdio.h>
++#include <string.h>
++#include <unistd.h>
++#include <sys/eventfd.h>
++
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++
++
++struct pollqueue;
++
++enum polltask_state {
++ POLLTASK_UNQUEUED = 0,
++ POLLTASK_QUEUED,
++ POLLTASK_RUNNING,
++ POLLTASK_Q_KILL,
++ POLLTASK_RUN_KILL,
++};
++
++struct polltask {
++ struct polltask *next;
++ struct polltask *prev;
++ struct pollqueue *q;
++ enum polltask_state state;
++
++ int fd;
++ short events;
++
++ void (*fn)(void *v, short revents);
++ void * v;
++
++ uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
++ sem_t kill_sem;
++};
++
++struct pollqueue {
++ atomic_int ref_count;
++ pthread_mutex_t lock;
++
++ struct polltask *head;
++ struct polltask *tail;
++
++ bool kill;
++ bool no_prod;
++ int prod_fd;
++ struct polltask *prod_pt;
++ pthread_t worker;
++};
++
++struct polltask *polltask_new(struct pollqueue *const pq,
++ const int fd, const short events,
++ void (*const fn)(void *v, short revents),
++ void *const v)
++{
++ struct polltask *pt;
++
++ if (!events)
++ return NULL;
++
++ pt = malloc(sizeof(*pt));
++ if (!pt)
++ return NULL;
++
++ *pt = (struct polltask){
++ .next = NULL,
++ .prev = NULL,
++ .q = pollqueue_ref(pq),
++ .fd = fd,
++ .events = events,
++ .fn = fn,
++ .v = v
++ };
++
++ sem_init(&pt->kill_sem, 0, 0);
++
++ return pt;
++}
++
++static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
++{
++ if (pt->prev)
++ pt->prev->next = pt->next;
++ else
++ pq->head = pt->next;
++ if (pt->next)
++ pt->next->prev = pt->prev;
++ else
++ pq->tail = pt->prev;
++ pt->next = NULL;
++ pt->prev = NULL;
++}
++
++static void polltask_free(struct polltask * const pt)
++{
++ sem_destroy(&pt->kill_sem);
++ free(pt);
++}
++
++static int pollqueue_prod(const struct pollqueue *const pq)
++{
++ static const uint64_t one = 1;
++ return write(pq->prod_fd, &one, sizeof(one));
++}
++
++void polltask_delete(struct polltask **const ppt)
++{
++ struct polltask *const pt = *ppt;
++ struct pollqueue * pq;
++ enum polltask_state state;
++ bool prodme;
++
++ if (!pt)
++ return;
++
++ pq = pt->q;
++ pthread_mutex_lock(&pq->lock);
++ state = pt->state;
++ pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
++ prodme = !pq->no_prod;
++ pthread_mutex_unlock(&pq->lock);
++
++ if (state != POLLTASK_UNQUEUED) {
++ if (prodme)
++ pollqueue_prod(pq);
++ while (sem_wait(&pt->kill_sem) && errno == EINTR)
++ /* loop */;
++ }
++
++ // Leave zapping the ref until we have DQed the PT as might well be
++ // legitimately used in it
++ *ppt = NULL;
++ polltask_free(pt);
++ pollqueue_unref(&pq);
++}
++
++static uint64_t pollqueue_now(int timeout)
++{
++ struct timespec now;
++ uint64_t now_ms;
++
++ if (clock_gettime(CLOCK_MONOTONIC, &now))
++ return 0;
++ now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout;
++ return now_ms ? now_ms : (uint64_t)1;
++}
++
++void pollqueue_add_task(struct polltask *const pt, const int timeout)
++{
++ bool prodme = false;
++ struct pollqueue * const pq = pt->q;
++
++ pthread_mutex_lock(&pq->lock);
++ if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) {
++ if (pq->tail)
++ pq->tail->next = pt;
++ else
++ pq->head = pt;
++ pt->prev = pq->tail;
++ pt->next = NULL;
++ pt->state = POLLTASK_QUEUED;
++ pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
++ pq->tail = pt;
++ prodme = !pq->no_prod;
++ }
++ pthread_mutex_unlock(&pq->lock);
++ if (prodme)
++ pollqueue_prod(pq);
++}
++
++static void *poll_thread(void *v)
++{
++ struct pollqueue *const pq = v;
++ struct pollfd *a = NULL;
++ size_t asize = 0;
++
++ pthread_mutex_lock(&pq->lock);
++ do {
++ unsigned int i;
++ unsigned int n = 0;
++ struct polltask *pt;
++ struct polltask *pt_next;
++ uint64_t now = pollqueue_now(0);
++ int timeout = -1;
++ int rv;
++
++ for (pt = pq->head; pt; pt = pt_next) {
++ int64_t t;
++
++ pt_next = pt->next;
++
++ if (pt->state == POLLTASK_Q_KILL) {
++ pollqueue_rem_task(pq, pt);
++ sem_post(&pt->kill_sem);
++ continue;
++ }
++
++ if (n >= asize) {
++ asize = asize ? asize * 2 : 4;
++ a = realloc(a, asize * sizeof(*a));
++ if (!a) {
++ request_log("Failed to realloc poll array to %zd\n", asize);
++ goto fail_locked;
++ }
++ }
++
++ a[n++] = (struct pollfd){
++ .fd = pt->fd,
++ .events = pt->events
++ };
++
++ t = (int64_t)(pt->timeout - now);
++ if (pt->timeout && t < INT_MAX &&
++ (timeout < 0 || (int)t < timeout))
++ timeout = (t < 0) ? 0 : (int)t;
++ }
++ pthread_mutex_unlock(&pq->lock);
++
++ if ((rv = poll(a, n, timeout)) == -1) {
++ if (errno != EINTR) {
++ request_log("Poll error: %s\n", strerror(errno));
++ goto fail_unlocked;
++ }
++ }
++
++ pthread_mutex_lock(&pq->lock);
++ now = pollqueue_now(0);
++
++ /* Prodding in this loop is pointless and might lead to
++ * infinite looping
++ */
++ pq->no_prod = true;
++ for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {
++ pt_next = pt->next;
++
++ /* Pending? */
++ if (a[i].revents ||
++ (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
++ pollqueue_rem_task(pq, pt);
++ if (pt->state == POLLTASK_QUEUED)
++ pt->state = POLLTASK_RUNNING;
++ if (pt->state == POLLTASK_Q_KILL)
++ pt->state = POLLTASK_RUN_KILL;
++ pthread_mutex_unlock(&pq->lock);
++
++ /* This can add new entries to the Q but as
++ * those are added to the tail our existing
++ * chain remains intact
++ */
++ pt->fn(pt->v, a[i].revents);
++
++ pthread_mutex_lock(&pq->lock);
++ if (pt->state == POLLTASK_RUNNING)
++ pt->state = POLLTASK_UNQUEUED;
++ if (pt->state == POLLTASK_RUN_KILL)
++ sem_post(&pt->kill_sem);
++ }
++ }
++ pq->no_prod = false;
++
++ } while (!pq->kill);
++
++fail_locked:
++ pthread_mutex_unlock(&pq->lock);
++fail_unlocked:
++ free(a);
++ return NULL;
++}
++
++static void prod_fn(void *v, short revents)
++{
++ struct pollqueue *const pq = v;
++ char buf[8];
++ if (revents)
++ read(pq->prod_fd, buf, 8);
++ if (!pq->kill)
++ pollqueue_add_task(pq->prod_pt, -1);
++}
++
++struct pollqueue * pollqueue_new(void)
++{
++ struct pollqueue *pq = malloc(sizeof(*pq));
++ if (!pq)
++ return NULL;
++ *pq = (struct pollqueue){
++ .ref_count = ATOMIC_VAR_INIT(0),
++ .lock = PTHREAD_MUTEX_INITIALIZER,
++ .head = NULL,
++ .tail = NULL,
++ .kill = false,
++ .prod_fd = -1
++ };
++
++ pq->prod_fd = eventfd(0, EFD_NONBLOCK);
++ if (pq->prod_fd == -1)
++ goto fail1;
++ pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
++ if (!pq->prod_pt)
++ goto fail2;
++ pollqueue_add_task(pq->prod_pt, -1);
++ if (pthread_create(&pq->worker, NULL, poll_thread, pq))
++ goto fail3;
++ // Reset ref count which will have been incremented by the add_task
++ atomic_store(&pq->ref_count, 0);
++ return pq;
++
++fail3:
++ polltask_free(pq->prod_pt);
++fail2:
++ close(pq->prod_fd);
++fail1:
++ free(pq);
++ return NULL;
++}
++
++static void pollqueue_free(struct pollqueue *const pq)
++{
++ void *rv;
++
++ pthread_mutex_lock(&pq->lock);
++ pq->kill = true;
++ pollqueue_prod(pq);
++ pthread_mutex_unlock(&pq->lock);
++
++ pthread_join(pq->worker, &rv);
++ polltask_free(pq->prod_pt);
++ pthread_mutex_destroy(&pq->lock);
++ close(pq->prod_fd);
++ free(pq);
++}
++
++struct pollqueue * pollqueue_ref(struct pollqueue *const pq)
++{
++ atomic_fetch_add(&pq->ref_count, 1);
++ return pq;
++}
++
++void pollqueue_unref(struct pollqueue **const ppq)
++{
++ struct pollqueue * const pq = *ppq;
++
++ if (!pq)
++ return;
++ *ppq = NULL;
++
++ if (atomic_fetch_sub(&pq->ref_count, 1) != 0)
++ return;
++
++ pollqueue_free(pq);
++}
++
++
++
+diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h
+new file mode 100644
+index 0000000000..e1182cb2fc
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.h
+@@ -0,0 +1,18 @@
++#ifndef POLLQUEUE_H_
++#define POLLQUEUE_H_
++
++struct polltask;
++struct pollqueue;
++
++struct polltask *polltask_new(struct pollqueue *const pq,
++ const int fd, const short events,
++ void (*const fn)(void *v, short revents),
++ void *const v);
++void polltask_delete(struct polltask **const ppt);
++
++void pollqueue_add_task(struct polltask *const pt, const int timeout);
++struct pollqueue * pollqueue_new(void);
++void pollqueue_unref(struct pollqueue **const ppq);
++struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
++
++#endif /* POLLQUEUE_H_ */
+diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h
+new file mode 100644
+index 0000000000..a31cc1f4ec
+--- /dev/null
++++ b/libavcodec/v4l2_req_utils.h
+@@ -0,0 +1,27 @@
++#ifndef AVCODEC_V4L2_REQ_UTILS_H
++#define AVCODEC_V4L2_REQ_UTILS_H
++
++#include <stdint.h>
++#include "libavutil/log.h"
++
++#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
++
++#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
++#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
++#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
++#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
++
++static inline char safechar(char c) {
++ return c > 0x20 && c < 0x7f ? c : '.';
++}
++
++static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
++ tbuf[0] = safechar((fcc >> 0) & 0xff);
++ tbuf[1] = safechar((fcc >> 8) & 0xff);
++ tbuf[2] = safechar((fcc >> 16) & 0xff);
++ tbuf[3] = safechar((fcc >> 24) & 0xff);
++ tbuf[4] = '\0';
++ return tbuf;
++}
++
++#endif
+diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
+new file mode 100644
+index 0000000000..db7ed13b6d
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.c
+@@ -0,0 +1,348 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#include "config.h"
++#include "decode.h"
++#include "hevcdec.h"
++#include "hwconfig.h"
++#include "internal.h"
++
++#include "v4l2_request_hevc.h"
++
++#include "libavutil/hwcontext_drm.h"
++#include "libavutil/pixdesc.h"
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_utils.h"
++
++static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
++{
++ const size_t wxh = w * h;
++ size_t bits_alloc;
++
++ /* Annex A gives a min compression of 2 @ lvl 3.1
++ * (wxh <= 983040) and min 4 thereafter but avoid
++ * the oddity of 983041 having a lower limit than
++ * 983040.
++ * Multiply by 3/2 for 4:2:0
++ */
++ bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
++ wxh < 983040 * 2 ? 983040 * 3 / 4 :
++ wxh * 3 / 8;
++ /* Allow for bit depth */
++ bits_alloc += (bits_alloc * bits_minus8) / 8;
++ /* Add a few bytes (16k) for overhead */
++ bits_alloc += 0x4000;
++ return bits_alloc;
++}
++
++static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
++ av_unused const uint8_t *buffer,
++ av_unused uint32_t size)
++{
++ const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->start_frame(avctx, buffer, size);
++}
++
++static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->decode_slice(avctx, buffer, size);
++}
++
++static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
++{
++ V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->end_frame(avctx);
++}
++
++static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ ctx->fns->abort_frame(avctx);
++}
++
++static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->frame_params(avctx, hw_frames_ctx);
++}
++
++static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ return ctx->fns->alloc_frame(avctx, frame);
++}
++
++
++static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
++{
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ decode_q_wait(&ctx->decode_q, NULL); // Wait for all other threads to be out of decode
++
++ mediabufs_ctl_unref(&ctx->mbufs);
++ media_pool_delete(&ctx->mpool);
++ pollqueue_unref(&ctx->pq);
++ dmabufs_ctl_unref(&ctx->dbufs);
++ devscan_delete(&ctx->devscan);
++
++ decode_q_uninit(&ctx->decode_q);
++
++// if (avctx->hw_frames_ctx) {
++// AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
++// av_buffer_pool_flush(hwfc->pool);
++// }
++ return 0;
++}
++
++static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
++{
++ AVCodecContext *const avctx = v;
++ const HEVCContext *const h = avctx->priv_data;
++
++ if (h->ps.sps->bit_depth == 8) {
++ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 ||
++ fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) {
++ return 1;
++ }
++ }
++ else if (h->ps.sps->bit_depth == 10) {
++ if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
++ return 1;
++ }
++ }
++ return 0;
++}
++
++static int v4l2_request_hevc_init(AVCodecContext *avctx)
++{
++ const HEVCContext *h = avctx->priv_data;
++ V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++ const HEVCSPS * const sps = h->ps.sps;
++ int ret;
++ const struct decdev * decdev;
++ const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2; // Assuming constant for all APIs but avoiding V4L2 includes
++ size_t src_size;
++ enum mediabufs_memory src_memtype;
++ enum mediabufs_memory dst_memtype;
++
++ av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ // Give up immediately if this is something that we have no code to deal with
++ if (h->ps.sps->chroma_format_idc != 1) {
++ av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
++ return AVERROR_PATCHWELCOME;
++ }
++ if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
++ h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
++ av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
++ return AVERROR_PATCHWELCOME;
++ }
++
++ if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
++ return (AVERROR(-ret));
++ }
++ ret = AVERROR(ENOMEM); // Assume mem fail by default for these
++
++ if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
++ {
++ av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
++ ret = AVERROR(ENODEV);
++ goto fail0;
++ }
++ av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
++ decdev_media_path(decdev), decdev_video_path(decdev));
++
++ if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
++ av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n");
++ src_memtype = MEDIABUFS_MEMORY_MMAP;
++ dst_memtype = MEDIABUFS_MEMORY_MMAP;
++ }
++ else {
++ av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n");
++ src_memtype = MEDIABUFS_MEMORY_DMABUF;
++ dst_memtype = MEDIABUFS_MEMORY_DMABUF;
++ }
++
++ if ((ctx->pq = pollqueue_new()) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
++ goto fail1;
++ }
++
++ if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
++ goto fail2;
++ }
++
++ if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
++ av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
++ goto fail3;
++ }
++
++ // Ask for an initial bitbuf size of max size / 4
++ // We will realloc if we need more
++ // Must use sps->h/w as avctx contains cropped size
++retry_src_memtype:
++ src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
++ if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs))
++ src_size /= 4;
++ // Kludge for conformance tests which break Annex A limits
++ else if (src_size < 0x40000)
++ src_size = 0x40000;
++
++ if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
++ sps->width, sps->height, src_size)) {
++ char tbuf1[5];
++ av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
++ goto fail4;
++ }
++
++ if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) {
++ if (src_memtype == MEDIABUFS_MEMORY_DMABUF) {
++ src_memtype = MEDIABUFS_MEMORY_MMAP;
++ goto retry_src_memtype;
++ }
++ av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n");
++ goto fail4;
++ }
++
++ if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 4);
++ }
++#if CONFIG_V4L2_REQ_HEVC_VX
++ else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 3);
++ }
++ else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 2);
++ }
++ else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
++ av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
++ ctx->fns = &V2(ff_v4l2_req_hevc, 1);
++ }
++#endif
++ else {
++ av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
++ ret = AVERROR(EINVAL);
++ goto fail4;
++ }
++
++ if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
++ char tbuf1[5];
++ av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
++ goto fail4;
++ }
++
++ if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
++ goto fail4;
++ }
++
++ {
++ unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
++ avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6);
++ av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
++ sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
++ avctx->thread_count, avctx->extra_hw_frames);
++
++ if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) {
++ if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n");
++ goto fail4;
++ }
++ av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n");
++ dst_memtype = MEDIABUFS_MEMORY_MMAP;
++ }
++
++ // extra_hw_frames is -1 if unset
++ if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
++ goto fail4;
++ }
++ }
++
++ if (mediabufs_stream_on(ctx->mbufs)) {
++ av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
++ goto fail4;
++ }
++
++ if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
++ goto fail4;
++ }
++
++ if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
++ goto fail5;
++ }
++
++ decode_q_init(&ctx->decode_q);
++
++ // Set our s/w format
++ avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
++
++ av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n",
++ ctx->fns->name,
++ decdev_media_path(decdev), decdev_video_path(decdev),
++ mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype),
++ av_get_pix_fmt_name(avctx->sw_pix_fmt));
++
++ return 0;
++
++fail5:
++ av_buffer_unref(&avctx->hw_frames_ctx);
++fail4:
++ mediabufs_ctl_unref(&ctx->mbufs);
++fail3:
++ media_pool_delete(&ctx->mpool);
++fail2:
++ pollqueue_unref(&ctx->pq);
++fail1:
++ dmabufs_ctl_unref(&ctx->dbufs);
++fail0:
++ devscan_delete(&ctx->devscan);
++ return ret;
++}
++
++const AVHWAccel ff_hevc_v4l2request_hwaccel = {
++ .name = "hevc_v4l2request",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .id = AV_CODEC_ID_HEVC,
++ .pix_fmt = AV_PIX_FMT_DRM_PRIME,
++ .alloc_frame = v4l2_req_hevc_alloc_frame,
++ .start_frame = v4l2_req_hevc_start_frame,
++ .decode_slice = v4l2_req_hevc_decode_slice,
++ .end_frame = v4l2_req_hevc_end_frame,
++ .abort_frame = v4l2_req_hevc_abort_frame,
++ .init = v4l2_request_hevc_init,
++ .uninit = v4l2_request_hevc_uninit,
++ .priv_data_size = sizeof(V4L2RequestContextHEVC),
++ .frame_params = v4l2_req_hevc_frame_params,
++ .caps_internal = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
+diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
+new file mode 100644
+index 0000000000..99c90064ea
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.h
+@@ -0,0 +1,102 @@
++#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
++#define AVCODEC_V4L2_REQUEST_HEVC_H
++
++#include <stdint.h>
++#include <drm_fourcc.h>
++#include "v4l2_req_decode_q.h"
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#include <linux/videodev2.h>
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in drm_fourcc.h hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
++#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
++#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY 0x0800
++#endif
++
++#define VCAT(name, version) name##_v##version
++#define V2(n,v) VCAT(n, v)
++#define V(n) V2(n, HEVC_CTRLS_VERSION)
++
++#define S2(x) #x
++#define STR(x) S2(x)
++
++// 1 per decoder
++struct v4l2_req_decode_fns;
++
++typedef struct V4L2RequestContextHEVC {
++// V4L2RequestContext base;
++ const struct v4l2_req_decode_fns * fns;
++
++ unsigned int timestamp; // ?? maybe uint64_t
++
++ int decode_mode;
++ int start_code;
++ unsigned int max_slices; // 0 => not wanted (frame mode)
++ unsigned int max_offsets; // 0 => not wanted
++
++ req_decode_q decode_q;
++
++ struct devscan *devscan;
++ struct dmabufs_ctl *dbufs;
++ struct pollqueue *pq;
++ struct media_pool * mpool;
++ struct mediabufs_ctl *mbufs;
++} V4L2RequestContextHEVC;
++
++typedef struct v4l2_req_decode_fns {
++ int src_pix_fmt_v4l2;
++ const char * name;
++
++ // Init setup
++ int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
++ int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
++
++ // Passthrough of hwaccel fns
++ int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
++ int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
++ int (*end_frame)(AVCodecContext *avctx);
++ void (*abort_frame)(AVCodecContext *avctx);
++ int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++ int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
++} v4l2_req_decode_fns;
++
++
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4);
++
++#endif
+diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c
+new file mode 100644
+index 0000000000..f234a985b9
+--- /dev/null
++++ b/libavcodec/weak_link.c
+@@ -0,0 +1,102 @@
++#include <stdlib.h>
++#include <pthread.h>
++#include <stdatomic.h>
++#include "weak_link.h"
++
++struct ff_weak_link_master {
++ atomic_int ref_count; /* 0 is single ref for easier atomics */
++ pthread_rwlock_t lock;
++ void * ptr;
++};
++
++static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
++{
++ return (struct ff_weak_link_master *)c;
++}
++
++struct ff_weak_link_master * ff_weak_link_new(void * p)
++{
++ struct ff_weak_link_master * w = malloc(sizeof(*w));
++ if (!w)
++ return NULL;
++ w->ptr = p;
++ if (pthread_rwlock_init(&w->lock, NULL)) {
++ free(w);
++ return NULL;
++ }
++ return w;
++}
++
++static void weak_link_do_unref(struct ff_weak_link_master * const w)
++{
++ int n = atomic_fetch_sub(&w->ref_count, 1);
++ if (n)
++ return;
++
++ pthread_rwlock_destroy(&w->lock);
++ free(w);
++}
++
++// Unref & break link
++void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
++{
++ struct ff_weak_link_master * const w = *ppLink;
++ if (!w)
++ return;
++
++ *ppLink = NULL;
++ pthread_rwlock_wrlock(&w->lock);
++ w->ptr = NULL;
++ pthread_rwlock_unlock(&w->lock);
++
++ weak_link_do_unref(w);
++}
++
++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
++{
++ if (!w)
++ return NULL;
++ atomic_fetch_add(&w->ref_count, 1);
++ return (struct ff_weak_link_client*)w;
++}
++
++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
++{
++ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
++ if (!w)
++ return;
++
++ *ppLink = NULL;
++ weak_link_do_unref(w);
++}
++
++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
++{
++ struct ff_weak_link_master * const w = weak_link_x(*ppLink);
++
++ if (!w)
++ return NULL;
++
++ if (pthread_rwlock_rdlock(&w->lock))
++ goto broken;
++
++ if (w->ptr)
++ return w->ptr;
++
++ pthread_rwlock_unlock(&w->lock);
++
++broken:
++ *ppLink = NULL;
++ weak_link_do_unref(w);
++ return NULL;
++}
++
++// Ignores a NULL c (so can be on the return path of both broken & live links)
++void ff_weak_link_unlock(struct ff_weak_link_client * c)
++{
++ struct ff_weak_link_master * const w = weak_link_x(c);
++ if (w)
++ pthread_rwlock_unlock(&w->lock);
++}
++
++
+diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h
+new file mode 100644
+index 0000000000..415b6a27a0
+--- /dev/null
++++ b/libavcodec/weak_link.h
+@@ -0,0 +1,23 @@
++struct ff_weak_link_master;
++struct ff_weak_link_client;
++
++struct ff_weak_link_master * ff_weak_link_new(void * p);
++void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
++
++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
++
++// Returns NULL if link broken - in this case it will also zap
++// *ppLink and unref the weak_link.
++// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
++//
++// The above does mean that there is a race if this is called simultainiously
++// by two threads using the same weak_link_client (so don't do that)
++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
++void ff_weak_link_unlock(struct ff_weak_link_client * c);
++
++
++
++
++
++
+diff --git a/libavdevice/Makefile b/libavdevice/Makefile
+index 8a62822b69..0989cb895f 100644
+--- a/libavdevice/Makefile
++++ b/libavdevice/Makefile
+@@ -48,6 +48,8 @@ OBJS-$(CONFIG_SNDIO_OUTDEV) += sndio_enc.o sndio.o
+ OBJS-$(CONFIG_V4L2_INDEV) += v4l2.o v4l2-common.o timefilter.o
+ OBJS-$(CONFIG_V4L2_OUTDEV) += v4l2enc.o v4l2-common.o
+ OBJS-$(CONFIG_VFWCAP_INDEV) += vfwcap.o
++OBJS-$(CONFIG_VOUT_DRM_OUTDEV) += drm_vout.o
++OBJS-$(CONFIG_VOUT_EGL_OUTDEV) += egl_vout.o
+ OBJS-$(CONFIG_XCBGRAB_INDEV) += xcbgrab.o
+ OBJS-$(CONFIG_XV_OUTDEV) += xv.o
+
+diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
+index 8a90fcb5d7..ffb410b92d 100644
+--- a/libavdevice/alldevices.c
++++ b/libavdevice/alldevices.c
+@@ -52,6 +52,8 @@ extern const FFOutputFormat ff_sndio_muxer;
+ extern const AVInputFormat ff_v4l2_demuxer;
+ extern const FFOutputFormat ff_v4l2_muxer;
+ extern const AVInputFormat ff_vfwcap_demuxer;
++extern const FFOutputFormat ff_vout_drm_muxer;
++extern const FFOutputFormat ff_vout_egl_muxer;
+ extern const AVInputFormat ff_xcbgrab_demuxer;
+ extern const FFOutputFormat ff_xv_muxer;
+
+diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
+new file mode 100644
+index 0000000000..491e1dc608
+--- /dev/null
++++ b/libavdevice/drm_vout.c
+@@ -0,0 +1,675 @@
++/*
++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++// *** This module is a work in progress and its utility is strictly
++// limited to testing.
++
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavformat/mux.h"
++#include "avdevice.h"
++
++#include "pthread.h"
++#include <semaphore.h>
++#include <unistd.h>
++
++#include <xf86drm.h>
++#include <xf86drmMode.h>
++#include <drm_fourcc.h>
++
++#define TRACE_ALL 0
++
++#define DRM_MODULE "vc4"
++
++#define ERRSTR strerror(errno)
++
++struct drm_setup {
++ int conId;
++ uint32_t crtcId;
++ int crtcIdx;
++ uint32_t planeId;
++ unsigned int out_fourcc;
++ struct {
++ int x, y, width, height;
++ } compose;
++};
++
++typedef struct drm_aux_s {
++ unsigned int fb_handle;
++ uint32_t bo_handles[AV_DRM_MAX_PLANES];
++ AVFrame * frame;
++} drm_aux_t;
++
++// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
++// we get initial flicker probably due to dodgy drm timing
++#define AUX_SIZE 3
++typedef struct drm_display_env_s
++{
++ AVClass *class;
++
++ int drm_fd;
++ uint32_t con_id;
++ struct drm_setup setup;
++ enum AVPixelFormat avfmt;
++
++ int show_all;
++ const char * drm_module;
++
++ unsigned int ano;
++ drm_aux_t aux[AUX_SIZE];
++
++ pthread_t q_thread;
++ sem_t q_sem_in;
++ sem_t q_sem_out;
++ int q_terminate;
++ AVFrame * q_next;
++
++} drm_display_env_t;
++
++
++static int drm_vout_write_trailer(AVFormatContext *s)
++{
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++
++ return 0;
++}
++
++static int drm_vout_write_header(AVFormatContext *s)
++{
++ const AVCodecParameters * const par = s->streams[0]->codecpar;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++ if ( s->nb_streams > 1
++ || par->codec_type != AVMEDIA_TYPE_VIDEO
++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++ return AVERROR(EINVAL);
++ }
++
++ return 0;
++}
++
++static int find_plane(struct AVFormatContext * const avctx,
++ const int drmfd, const int crtcidx, const uint32_t format,
++ uint32_t * const pplane_id)
++{
++ drmModePlaneResPtr planes;
++ drmModePlanePtr plane;
++ drmModeObjectPropertiesPtr props = NULL;
++ drmModePropertyPtr prop = NULL;
++ unsigned int i;
++ unsigned int j;
++ int ret = -1;
++
++ planes = drmModeGetPlaneResources(drmfd);
++ if (!planes)
++ {
++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
++ return -1;
++ }
++
++ for (i = 0; i < planes->count_planes; ++i) {
++ plane = drmModeGetPlane(drmfd, planes->planes[i]);
++ if (!planes)
++ {
++ av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
++ break;
++ }
++
++ if (!(plane->possible_crtcs & (1 << crtcidx))) {
++ drmModeFreePlane(plane);
++ continue;
++ }
++
++ for (j = 0; j < plane->count_formats; ++j) {
++ if (plane->formats[j] == format)
++ break;
++ }
++
++ if (j == plane->count_formats) {
++ drmModeFreePlane(plane);
++ continue;
++ }
++
++ *pplane_id = plane->plane_id;
++ drmModeFreePlane(plane);
++ break;
++ }
++
++ if (i == planes->count_planes) {
++ ret = -1;
++ goto fail;
++ }
++
++ props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE);
++ if (!props)
++ goto fail;
++ for (i = 0; i != props->count_props; ++i) {
++ if (prop)
++ drmModeFreeProperty(prop);
++ prop = drmModeGetProperty(drmfd, props->props[i]);
++ if (!prop)
++ goto fail;
++ if (strcmp("zpos", prop->name) == 0) {
++ if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0)
++ av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]);
++ else
++ av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n");
++ break;
++ }
++ }
++
++ ret = 0;
++fail:
++ if (props)
++ drmModeFreeObjectProperties(props);
++ if (prop)
++ drmModeFreeProperty(prop);
++ drmModeFreePlaneResources(planes);
++ return ret;
++}
++
++static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
++{
++ if (da->fb_handle != 0) {
++ drmModeRmFB(de->drm_fd, da->fb_handle);
++ da->fb_handle = 0;
++ }
++
++ for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) {
++ if (da->bo_handles[i]) {
++ struct drm_gem_close gem_close = {.handle = da->bo_handles[i]};
++ drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
++ da->bo_handles[i] = 0;
++ }
++ }
++ av_frame_free(&da->frame);
++}
++
++static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
++{
++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
++ drm_aux_t * da = de->aux + de->ano;
++ const uint32_t format = desc->layers[0].format;
++ int ret = 0;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
++#endif
++
++ if (de->setup.out_fourcc != format) {
++ if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
++ av_frame_free(&frame);
++ av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
++ return -1;
++ }
++ de->setup.out_fourcc = format;
++ }
++
++ {
++ drmVBlank vbl = {
++ .request = {
++ .type = DRM_VBLANK_RELATIVE,
++ .sequence = 0
++ }
++ };
++
++ while (drmWaitVBlank(de->drm_fd, &vbl)) {
++ if (errno != EINTR) {
++// av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
++ break;
++ }
++ }
++ }
++
++ da_uninit(de, da);
++
++ {
++ uint32_t pitches[4] = {0};
++ uint32_t offsets[4] = {0};
++ uint64_t modifiers[4] = {0};
++ uint32_t bo_handles[4] = {0};
++ int has_mods = 0;
++ int i, j, n;
++
++ da->frame = frame;
++
++ for (i = 0; i < desc->nb_objects; ++i) {
++ if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
++ av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
++ return -1;
++ }
++ if (desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR &&
++ desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID)
++ has_mods = 1;
++ }
++
++ n = 0;
++ for (i = 0; i < desc->nb_layers; ++i) {
++ for (j = 0; j < desc->layers[i].nb_planes; ++j) {
++ const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
++ const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
++ pitches[n] = p->pitch;
++ offsets[n] = p->offset;
++ modifiers[n] = obj->format_modifier;
++ bo_handles[n] = da->bo_handles[p->object_index];
++ ++n;
++ }
++ }
++
++#if 1 && TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
++ " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
++ av_frame_cropped_width(frame),
++ av_frame_cropped_height(frame),
++ desc->layers[0].format,
++ bo_handles[0],
++ bo_handles[1],
++ bo_handles[2],
++ bo_handles[3],
++ pitches[0],
++ pitches[1],
++ pitches[2],
++ pitches[3],
++ offsets[0],
++ offsets[1],
++ offsets[2],
++ offsets[3],
++ (long long)modifiers[0],
++ (long long)modifiers[1],
++ (long long)modifiers[2],
++ (long long)modifiers[3]
++ );
++#endif
++
++ if (drmModeAddFB2WithModifiers(de->drm_fd,
++ av_frame_cropped_width(frame),
++ av_frame_cropped_height(frame),
++ desc->layers[0].format, bo_handles,
++ pitches, offsets,
++ has_mods ? modifiers : NULL,
++ &da->fb_handle,
++ has_mods ? DRM_MODE_FB_MODIFIERS : 0) != 0) {
++ av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
++ return -1;
++ }
++ }
++
++ ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
++ da->fb_handle, 0,
++ de->setup.compose.x, de->setup.compose.y,
++ de->setup.compose.width,
++ de->setup.compose.height,
++ 0, 0,
++ av_frame_cropped_width(frame) << 16,
++ av_frame_cropped_height(frame) << 16);
++
++ if (ret != 0) {
++ av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
++ }
++
++ de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
++
++ return ret;
++}
++
++static int do_sem_wait(sem_t * const sem, const int nowait)
++{
++ while (nowait ? sem_trywait(sem) : sem_wait(sem)) {
++ if (errno != EINTR)
++ return -errno;
++ }
++ return 0;
++}
++
++static void * display_thread(void * v)
++{
++ AVFormatContext * const s = v;
++ drm_display_env_t * const de = s->priv_data;
++ int i;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++#endif
++
++ sem_post(&de->q_sem_out);
++
++ for (;;) {
++ AVFrame * frame;
++
++ do_sem_wait(&de->q_sem_in, 0);
++
++ if (de->q_terminate)
++ break;
++
++ frame = de->q_next;
++ de->q_next = NULL;
++ sem_post(&de->q_sem_out);
++
++ do_display(s, de, frame);
++ }
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++#endif
++
++ for (i = 0; i != AUX_SIZE; ++i)
++ da_uninit(de, de->aux + i);
++
++ av_frame_free(&de->q_next);
++
++ return NULL;
++}
++
++static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++ const AVFrame * const src_frame = (AVFrame *)pkt->data;
++ AVFrame * frame;
++ drm_display_env_t * const de = s->priv_data;
++ int ret;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++
++ if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
++ av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
++ return 0;
++ }
++
++ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
++ frame = av_frame_alloc();
++ av_frame_ref(frame, src_frame);
++ }
++ else if (src_frame->format == AV_PIX_FMT_VAAPI) {
++ frame = av_frame_alloc();
++ frame->format = AV_PIX_FMT_DRM_PRIME;
++ if (av_hwframe_map(frame, src_frame, 0) != 0)
++ {
++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
++ av_frame_free(&frame);
++ return AVERROR(EINVAL);
++ }
++ }
++ else {
++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
++ return AVERROR(EINVAL);
++ }
++
++ ret = do_sem_wait(&de->q_sem_out, !de->show_all);
++ if (ret) {
++ av_frame_free(&frame);
++ }
++ else {
++ de->q_next = frame;
++ sem_post(&de->q_sem_in);
++ }
++
++ return 0;
++}
++
++static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++ unsigned flags)
++{
++ av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++ return AVERROR_PATCHWELCOME;
++}
++
++static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++#if TRACE_ALL
++ av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
++#endif
++ switch(type) {
++ case AV_APP_TO_DEV_WINDOW_REPAINT:
++ return 0;
++ default:
++ break;
++ }
++ return AVERROR(ENOSYS);
++}
++
++static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
++{
++ int ret = -1;
++ int i;
++ drmModeRes *res = drmModeGetResources(drmfd);
++ drmModeConnector *c;
++
++ if(!res)
++ {
++ printf( "drmModeGetResources failed: %s\n", ERRSTR);
++ return -1;
++ }
++
++ if (res->count_crtcs <= 0)
++ {
++ printf( "drm: no crts\n");
++ goto fail_res;
++ }
++
++ if (!s->conId) {
++ fprintf(stderr,
++ "No connector ID specified. Choosing default from list:\n");
++
++ for (i = 0; i < res->count_connectors; i++) {
++ drmModeConnector *con =
++ drmModeGetConnector(drmfd, res->connectors[i]);
++ drmModeEncoder *enc = NULL;
++ drmModeCrtc *crtc = NULL;
++
++ if (con->encoder_id) {
++ enc = drmModeGetEncoder(drmfd, con->encoder_id);
++ if (enc->crtc_id) {
++ crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
++ }
++ }
++
++ if (!s->conId && crtc) {
++ s->conId = con->connector_id;
++ s->crtcId = crtc->crtc_id;
++ }
++
++ av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
++ con->connector_id,
++ crtc ? crtc->crtc_id : 0,
++ con->connector_type,
++ crtc ? crtc->width : 0,
++ crtc ? crtc->height : 0,
++ (s->conId == (int)con->connector_id ?
++ " (chosen)" : ""));
++ }
++
++ if (!s->conId) {
++ av_log(avctx, AV_LOG_ERROR,
++ "No suitable enabled connector found.\n");
++ return -1;;
++ }
++ }
++
++ s->crtcIdx = -1;
++
++ for (i = 0; i < res->count_crtcs; ++i) {
++ if (s->crtcId == res->crtcs[i]) {
++ s->crtcIdx = i;
++ break;
++ }
++ }
++
++ if (s->crtcIdx == -1)
++ {
++ av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
++ goto fail_res;
++ }
++
++ if (res->count_connectors <= 0)
++ {
++ av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
++ goto fail_res;
++ }
++
++ c = drmModeGetConnector(drmfd, s->conId);
++ if (!c)
++ {
++ av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
++ goto fail_res;
++ }
++
++ if (!c->count_modes)
++ {
++ av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
++ goto fail_conn;
++ }
++
++ {
++ drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId);
++ s->compose.x = crtc->x;
++ s->compose.y = crtc->y;
++ s->compose.width = crtc->width;
++ s->compose.height = crtc->height;
++ drmModeFreeCrtc(crtc);
++ }
++
++ if (pConId)
++ *pConId = c->connector_id;
++ ret = 0;
++
++fail_conn:
++ drmModeFreeConnector(c);
++
++fail_res:
++ drmModeFreeResources(res);
++
++ return ret;
++}
++
++// deinit is called if init fails so no need to clean up explicity here
++static int drm_vout_init(struct AVFormatContext * s)
++{
++ drm_display_env_t * const de = s->priv_data;
++ int rv;
++
++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ de->drm_fd = -1;
++ de->con_id = 0;
++ de->setup = (struct drm_setup){0};
++ de->q_terminate = 0;
++
++ if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0)
++ {
++ rv = AVERROR(errno);
++ av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv));
++ return rv;
++ }
++
++ if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
++ {
++ av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
++ rv = AVERROR(EINVAL);
++ goto fail_close;
++ }
++
++ sem_init(&de->q_sem_in, 0, 0);
++ sem_init(&de->q_sem_out, 0, 0);
++ if (pthread_create(&de->q_thread, NULL, display_thread, s)) {
++ rv = AVERROR(errno);
++ av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv));
++ goto fail_close;
++ }
++
++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++
++ return 0;
++
++fail_close:
++ close(de->drm_fd);
++ de->drm_fd = -1;
++ av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
++
++ return rv;
++}
++
++static void drm_vout_deinit(struct AVFormatContext * s)
++{
++ drm_display_env_t * const de = s->priv_data;
++
++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ de->q_terminate = 1;
++ sem_post(&de->q_sem_in);
++ pthread_join(de->q_thread, NULL);
++ sem_destroy(&de->q_sem_in);
++ sem_destroy(&de->q_sem_out);
++
++ for (unsigned int i = 0; i != AUX_SIZE; ++i)
++ da_uninit(de, de->aux + i);
++
++ av_frame_free(&de->q_next);
++
++ if (de->drm_fd >= 0) {
++ close(de->drm_fd);
++ de->drm_fd = -1;
++ }
++
++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++}
++
++
++#define OFFSET(x) offsetof(drm_display_env_t, x)
++static const AVOption options[] = {
++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { "drm_module", "drm_module name to use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
++ { NULL }
++};
++
++static const AVClass drm_vout_class = {
++ .class_name = "drm vid outdev",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++FFOutputFormat ff_vout_drm_muxer = {
++ .p = {
++ .name = "vout_drm",
++ .long_name = NULL_IF_CONFIG_SMALL("Drm video output device"),
++ .audio_codec = AV_CODEC_ID_NONE,
++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++ .priv_class = &drm_vout_class,
++ },
++ .priv_data_size = sizeof(drm_display_env_t),
++ .write_header = drm_vout_write_header,
++ .write_packet = drm_vout_write_packet,
++ .write_uncoded_frame = drm_vout_write_frame,
++ .write_trailer = drm_vout_write_trailer,
++ .control_message = drm_vout_control_message,
++ .init = drm_vout_init,
++ .deinit = drm_vout_deinit,
++};
++
+diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
+new file mode 100644
+index 0000000000..afc7afd13e
+--- /dev/null
++++ b/libavdevice/egl_vout.c
+@@ -0,0 +1,783 @@
++/*
++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++// *** This module is a work in progress and its utility is strictly
++// limited to testing.
++// Amongst other issues it doesn't wait for the pic to be displayed before
++// returning the buffer so flickering does occur.
++
++#include <epoxy/gl.h>
++#include <epoxy/egl.h>
++
++#include "libavutil/opt.h"
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/imgutils.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavformat/mux.h"
++#include "avdevice.h"
++
++#include "pthread.h"
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <unistd.h>
++
++#include <X11/Xlib.h>
++#include <X11/Xutil.h>
++
++#include "libavutil/rpi_sand_fns.h"
++
++#define TRACE_ALL 0
++
++struct egl_setup {
++ int conId;
++
++ Display *dpy;
++ EGLDisplay egl_dpy;
++ EGLContext ctx;
++ EGLSurface surf;
++ Window win;
++
++ uint32_t crtcId;
++ int crtcIdx;
++ uint32_t planeId;
++ struct {
++ int x, y, width, height;
++ } compose;
++};
++
++typedef struct egl_aux_s {
++ int fd;
++ GLuint texture;
++
++} egl_aux_t;
++
++typedef struct egl_display_env_s {
++ AVClass *class;
++
++ struct egl_setup setup;
++ enum AVPixelFormat avfmt;
++
++ int show_all;
++ int window_width, window_height;
++ int window_x, window_y;
++ int fullscreen;
++
++ egl_aux_t aux[32];
++
++ pthread_t q_thread;
++ pthread_mutex_t q_lock;
++ sem_t display_start_sem;
++ sem_t q_sem;
++ int q_terminate;
++ AVFrame *q_this;
++ AVFrame *q_next;
++
++} egl_display_env_t;
++
++
++/**
++ * Remove window border/decorations.
++ */
++static void
++no_border(Display *dpy, Window w)
++{
++ static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
++ static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
++
++ typedef struct {
++ unsigned long flags;
++ unsigned long functions;
++ unsigned long decorations;
++ long inputMode;
++ unsigned long status;
++ } PropMotifWmHints;
++
++ PropMotifWmHints motif_hints;
++ Atom prop, proptype;
++ unsigned long flags = 0;
++
++ /* setup the property */
++ motif_hints.flags = MWM_HINTS_DECORATIONS;
++ motif_hints.decorations = flags;
++
++ /* get the atom for the property */
++ prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True);
++ if (!prop) {
++ /* something went wrong! */
++ return;
++ }
++
++ /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
++ proptype = prop;
++
++ XChangeProperty(dpy, w, /* display, window */
++ prop, proptype, /* property, type */
++ 32, /* format: 32-bit datums */
++ PropModeReplace, /* mode */
++ (unsigned char *)&motif_hints, /* data */
++ PROP_MOTIF_WM_HINTS_ELEMENTS /* nelements */
++ );
++}
++
++
++/*
++ * Create an RGB, double-buffered window.
++ * Return the window and context handles.
++ */
++static int
++make_window(struct AVFormatContext *const s,
++ egl_display_env_t *const de,
++ Display *dpy, EGLDisplay egl_dpy, const char *name,
++ Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
++{
++ int scrnum = DefaultScreen(dpy);
++ XSetWindowAttributes attr;
++ unsigned long mask;
++ Window root = RootWindow(dpy, scrnum);
++ Window win;
++ EGLContext ctx;
++ const int fullscreen = de->fullscreen;
++ EGLConfig config;
++ int x = de->window_x;
++ int y = de->window_y;
++ int width = de->window_width ? de->window_width : 1280;
++ int height = de->window_height ? de->window_height : 720;
++
++
++ if (fullscreen) {
++ int scrnum = DefaultScreen(dpy);
++
++ x = 0; y = 0;
++ width = DisplayWidth(dpy, scrnum);
++ height = DisplayHeight(dpy, scrnum);
++ }
++
++ {
++ EGLint num_configs;
++ static const EGLint attribs[] = {
++ EGL_RED_SIZE, 1,
++ EGL_GREEN_SIZE, 1,
++ EGL_BLUE_SIZE, 1,
++ EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
++ EGL_NONE
++ };
++
++ if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
++ av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
++ return -1;
++ }
++ }
++
++ {
++ EGLint vid;
++ if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
++ av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
++ return -1;
++ }
++
++ {
++ XVisualInfo visTemplate = {
++ .visualid = vid,
++ };
++ int num_visuals;
++ XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
++ &visTemplate, &num_visuals);
++
++ /* window attributes */
++ attr.background_pixel = 0;
++ attr.border_pixel = 0;
++ attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone);
++ attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
++ /* XXX this is a bad way to get a borderless window! */
++ mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
++
++ win = XCreateWindow(dpy, root, x, y, width, height,
++ 0, visinfo->depth, InputOutput,
++ visinfo->visual, mask, &attr);
++ XFree(visinfo);
++ }
++ }
++
++ if (fullscreen)
++ no_border(dpy, win);
++
++ /* set hints and properties */
++ {
++ XSizeHints sizehints;
++ sizehints.x = x;
++ sizehints.y = y;
++ sizehints.width = width;
++ sizehints.height = height;
++ sizehints.flags = USSize | USPosition;
++ XSetNormalHints(dpy, win, &sizehints);
++ XSetStandardProperties(dpy, win, name, name,
++ None, (char **)NULL, 0, &sizehints);
++ }
++
++ eglBindAPI(EGL_OPENGL_ES_API);
++
++ {
++ static const EGLint ctx_attribs[] = {
++ EGL_CONTEXT_CLIENT_VERSION, 2,
++ EGL_NONE
++ };
++ ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs);
++ if (!ctx) {
++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
++ return -1;
++ }
++ }
++
++
++ XMapWindow(dpy, win);
++
++ {
++ EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
++ if (!surf) {
++ av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
++ return -1;
++ }
++
++ if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
++ av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
++ return -1;
++ }
++
++ *winRet = win;
++ *ctxRet = ctx;
++ *surfRet = surf;
++ }
++
++ return 0;
++}
++
++static GLint
++compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source)
++{
++ GLuint s = glCreateShader(target);
++
++ if (s == 0) {
++ av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
++ return 0;
++ }
++
++ glShaderSource(s, 1, (const GLchar **)&source, NULL);
++ glCompileShader(s);
++
++ {
++ GLint ok;
++ glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
++
++ if (!ok) {
++ GLchar *info;
++ GLint size;
++
++ glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
++ info = malloc(size);
++
++ glGetShaderInfoLog(s, size, NULL, info);
++ av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
++
++ return 0;
++ }
++ }
++
++ return s;
++}
++
++static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs)
++{
++ GLuint prog = glCreateProgram();
++
++ if (prog == 0) {
++ av_log(s, AV_LOG_ERROR, "Failed to create program\n");
++ return 0;
++ }
++
++ glAttachShader(prog, vs);
++ glAttachShader(prog, fs);
++ glLinkProgram(prog);
++
++ {
++ GLint ok;
++ glGetProgramiv(prog, GL_LINK_STATUS, &ok);
++ if (!ok) {
++ /* Some drivers return a size of 1 for an empty log. This is the size
++ * of a log that contains only a terminating NUL character.
++ */
++ GLint size;
++ GLchar *info = NULL;
++ glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
++ if (size > 1) {
++ info = malloc(size);
++ glGetProgramInfoLog(prog, size, NULL, info);
++ }
++
++ av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
++ (info != NULL) ? info : "<empty log>");
++ return 0;
++ }
++ }
++
++ return prog;
++}
++
++static int
++gl_setup(struct AVFormatContext *const s)
++{
++ const char *vs =
++ "attribute vec4 pos;\n"
++ "varying vec2 texcoord;\n"
++ "\n"
++ "void main() {\n"
++ " gl_Position = pos;\n"
++ " texcoord.x = (pos.x + 1.0) / 2.0;\n"
++ " texcoord.y = (-pos.y + 1.0) / 2.0;\n"
++ "}\n";
++ const char *fs =
++ "#extension GL_OES_EGL_image_external : enable\n"
++ "precision mediump float;\n"
++ "uniform samplerExternalOES s;\n"
++ "varying vec2 texcoord;\n"
++ "void main() {\n"
++ " gl_FragColor = texture2D(s, texcoord);\n"
++ "}\n";
++
++ GLuint vs_s;
++ GLuint fs_s;
++ GLuint prog;
++
++ if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
++ !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
++ !(prog = link_program(s, vs_s, fs_s)))
++ return -1;
++
++ glUseProgram(prog);
++
++ {
++ static const float verts[] = {
++ -1, -1,
++ 1, -1,
++ 1, 1,
++ -1, 1,
++ };
++ glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
++ }
++
++ glEnableVertexAttribArray(0);
++ return 0;
++}
++
++static int egl_vout_write_trailer(AVFormatContext *s)
++{
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++
++ return 0;
++}
++
++static int egl_vout_write_header(AVFormatContext *s)
++{
++ const AVCodecParameters *const par = s->streams[0]->codecpar;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++ if (s->nb_streams > 1
++ || par->codec_type != AVMEDIA_TYPE_VIDEO
++ || par->codec_id != AV_CODEC_ID_WRAPPED_AVFRAME) {
++ av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++ return AVERROR(EINVAL);
++ }
++
++ return 0;
++}
++
++
++static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame)
++{
++ const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0];
++ egl_aux_t *da = NULL;
++ unsigned int i;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
++#endif
++
++ for (i = 0; i != 32; ++i) {
++ if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
++ da = de->aux + i;
++ break;
++ }
++ }
++
++ if (da == NULL) {
++ av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
++ return AVERROR(EINVAL);
++ }
++
++ if (da->texture == 0) {
++ EGLint attribs[50];
++ EGLint *a = attribs;
++ int i, j;
++ static const EGLint anames[] = {
++ EGL_DMA_BUF_PLANE0_FD_EXT,
++ EGL_DMA_BUF_PLANE0_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE0_PITCH_EXT,
++ EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
++ EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
++ EGL_DMA_BUF_PLANE1_FD_EXT,
++ EGL_DMA_BUF_PLANE1_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE1_PITCH_EXT,
++ EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
++ EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
++ EGL_DMA_BUF_PLANE2_FD_EXT,
++ EGL_DMA_BUF_PLANE2_OFFSET_EXT,
++ EGL_DMA_BUF_PLANE2_PITCH_EXT,
++ EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
++ EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
++ };
++ const EGLint *b = anames;
++
++ *a++ = EGL_WIDTH;
++ *a++ = av_frame_cropped_width(frame);
++ *a++ = EGL_HEIGHT;
++ *a++ = av_frame_cropped_height(frame);
++ *a++ = EGL_LINUX_DRM_FOURCC_EXT;
++ *a++ = desc->layers[0].format;
++
++ for (i = 0; i < desc->nb_layers; ++i) {
++ for (j = 0; j < desc->layers[i].nb_planes; ++j) {
++ const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j;
++ const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index;
++ *a++ = *b++;
++ *a++ = obj->fd;
++ *a++ = *b++;
++ *a++ = p->offset;
++ *a++ = *b++;
++ *a++ = p->pitch;
++ if (obj->format_modifier == 0) {
++ b += 2;
++ }
++ else {
++ *a++ = *b++;
++ *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
++ *a++ = *b++;
++ *a++ = (EGLint)(obj->format_modifier >> 32);
++ }
++ }
++ }
++
++ *a = EGL_NONE;
++
++#if TRACE_ALL
++ for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
++ av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
++ }
++#endif
++ {
++ const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
++ EGL_NO_CONTEXT,
++ EGL_LINUX_DMA_BUF_EXT,
++ NULL, attribs);
++ if (!image) {
++ av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
++ return -1;
++ }
++
++ glGenTextures(1, &da->texture);
++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
++ glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
++ glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
++
++ eglDestroyImageKHR(de->setup.egl_dpy, image);
++ }
++
++ da->fd = desc->objects[0].fd;
++ }
++
++ glClearColor(0.5, 0.5, 0.5, 0.5);
++ glClear(GL_COLOR_BUFFER_BIT);
++
++ glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
++ glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
++ eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
++
++ glDeleteTextures(1, &da->texture);
++ da->texture = 0;
++ da->fd = -1;
++
++ return 0;
++}
++
++static void* display_thread(void *v)
++{
++ AVFormatContext *const s = v;
++ egl_display_env_t *const de = s->priv_data;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
++#endif
++ {
++ EGLint egl_major, egl_minor;
++
++ de->setup.dpy = XOpenDisplay(NULL);
++ if (!de->setup.dpy) {
++ av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
++ goto fail;
++ }
++
++ de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
++ if (!de->setup.egl_dpy) {
++ av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
++ goto fail;
++ }
++
++ if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
++ av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
++ goto fail;
++ }
++
++ av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
++
++ if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
++ av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
++ goto fail;
++ }
++ }
++
++ if (!de->window_width || !de->window_height) {
++ de->window_width = 1280;
++ de->window_height = 720;
++ }
++ if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
++ &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
++ av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
++ goto fail;
++ }
++
++ if (gl_setup(s)) {
++ av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
++ goto fail;
++ }
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
++#endif
++ sem_post(&de->display_start_sem);
++
++ for (;;) {
++ AVFrame *frame;
++
++ while (sem_wait(&de->q_sem) != 0) {
++ av_assert0(errno == EINTR);
++ }
++
++ if (de->q_terminate)
++ break;
++
++ pthread_mutex_lock(&de->q_lock);
++ frame = de->q_next;
++ de->q_next = NULL;
++ pthread_mutex_unlock(&de->q_lock);
++
++ do_display(s, de, frame);
++
++ av_frame_free(&de->q_this);
++ de->q_this = frame;
++ }
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
++#endif
++
++ return NULL;
++
++fail:
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
++#endif
++ de->q_terminate = 1;
++ sem_post(&de->display_start_sem);
++
++ return NULL;
++}
++
++static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++ const AVFrame *const src_frame = (AVFrame *)pkt->data;
++ AVFrame *frame;
++ egl_display_env_t *const de = s->priv_data;
++
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++
++ if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
++ frame = av_frame_alloc();
++ av_frame_ref(frame, src_frame);
++ }
++ else if (src_frame->format == AV_PIX_FMT_VAAPI) {
++ frame = av_frame_alloc();
++ frame->format = AV_PIX_FMT_DRM_PRIME;
++ if (av_hwframe_map(frame, src_frame, 0) != 0) {
++ av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
++ av_frame_free(&frame);
++ return AVERROR(EINVAL);
++ }
++ }
++ else {
++ av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
++ return AVERROR(EINVAL);
++ }
++
++ // Really hacky sync
++ while (de->show_all && de->q_next) {
++ usleep(3000);
++ }
++
++ pthread_mutex_lock(&de->q_lock);
++ {
++ AVFrame *const t = de->q_next;
++ de->q_next = frame;
++ frame = t;
++ }
++ pthread_mutex_unlock(&de->q_lock);
++
++ if (frame == NULL)
++ sem_post(&de->q_sem);
++ else
++ av_frame_free(&frame);
++
++ return 0;
++}
++
++static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++ unsigned flags)
++{
++ av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++ return AVERROR_PATCHWELCOME;
++}
++
++static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++#if TRACE_ALL
++ av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
++#endif
++ switch (type) {
++ case AV_APP_TO_DEV_WINDOW_REPAINT:
++ return 0;
++ default:
++ break;
++ }
++ return AVERROR(ENOSYS);
++}
++
++// deinit is called if init fails so no need to clean up explicitly here
++static int egl_vout_init(struct AVFormatContext *s)
++{
++ egl_display_env_t *const de = s->priv_data;
++ unsigned int i;
++
++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ de->setup = (struct egl_setup) { 0 };
++
++ for (i = 0; i != 32; ++i) {
++ de->aux[i].fd = -1;
++ }
++
++ de->q_terminate = 0;
++ pthread_mutex_init(&de->q_lock, NULL);
++ sem_init(&de->q_sem, 0, 0);
++ sem_init(&de->display_start_sem, 0, 0);
++ av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
++
++ sem_wait(&de->display_start_sem);
++ if (de->q_terminate) {
++ av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
++ return -1;
++ }
++
++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++
++ return 0;
++}
++
++static void egl_vout_deinit(struct AVFormatContext *s)
++{
++ egl_display_env_t *const de = s->priv_data;
++
++ av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++ de->q_terminate = 1;
++ sem_post(&de->q_sem);
++ pthread_join(de->q_thread, NULL);
++ sem_destroy(&de->q_sem);
++ pthread_mutex_destroy(&de->q_lock);
++
++ av_frame_free(&de->q_next);
++ av_frame_free(&de->q_this);
++
++ av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++}
++
++#define OFFSET(x) offsetof(egl_display_env_t, x)
++static const AVOption options[] = {
++ { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_size", "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_x", "set window x offset", OFFSET(window_x), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "window_y", "set window y offset", OFFSET(window_y), AV_OPT_TYPE_INT, { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++ { "fullscreen", "set fullscreen display", OFFSET(fullscreen), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++ { NULL }
++
++};
++
++static const AVClass egl_vout_class = {
++ .class_name = "egl vid outdev",
++ .item_name = av_default_item_name,
++ .option = options,
++ .version = LIBAVUTIL_VERSION_INT,
++ .category = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++FFOutputFormat ff_vout_egl_muxer = {
++ .p = {
++ .name = "vout_egl",
++ .long_name = NULL_IF_CONFIG_SMALL("Egl video output device"),
++ .audio_codec = AV_CODEC_ID_NONE,
++ .video_codec = AV_CODEC_ID_WRAPPED_AVFRAME,
++ .flags = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++ .priv_class = &egl_vout_class,
++ },
++ .priv_data_size = sizeof(egl_display_env_t),
++ .write_header = egl_vout_write_header,
++ .write_packet = egl_vout_write_packet,
++ .write_uncoded_frame = egl_vout_write_frame,
++ .write_trailer = egl_vout_write_trailer,
++ .control_message = egl_vout_control_message,
++ .init = egl_vout_init,
++ .deinit = egl_vout_deinit,
++};
++
+diff --git a/libavfilter/Makefile b/libavfilter/Makefile
+index b3d3d981dd..0e7b5856bd 100644
+--- a/libavfilter/Makefile
++++ b/libavfilter/Makefile
+@@ -262,6 +262,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER) += vf_neighbor.o
+ OBJS-$(CONFIG_DEFLICKER_FILTER) += vf_deflicker.o
+ OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER) += vf_vpp_qsv.o
+ OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER) += vf_deinterlace_vaapi.o vaapi_vpp.o
++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER) += vf_deinterlace_v4l2m2m.o
+ OBJS-$(CONFIG_DEJUDDER_FILTER) += vf_dejudder.o
+ OBJS-$(CONFIG_DELOGO_FILTER) += vf_delogo.o
+ OBJS-$(CONFIG_DENOISE_VAAPI_FILTER) += vf_misc_vaapi.o vaapi_vpp.o
+@@ -518,6 +519,7 @@ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER) += vf_transpose_vaapi.o vaapi_vpp.o
+ OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER) += vf_transpose_vulkan.o vulkan.o vulkan_filter.o
+ OBJS-$(CONFIG_TRIM_FILTER) += trim.o
+ OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o
++OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o
+ OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o
+ OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \
+ opencl/unsharp.o
+diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
+index b58daa3a3f..1a4cd935f1 100644
+--- a/libavfilter/aarch64/Makefile
++++ b/libavfilter/aarch64/Makefile
+@@ -1,3 +1,5 @@
++OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_init_aarch64.o
+ OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_init.o
+
++NEON-OBJS-$(CONFIG_BWDIF_FILTER) += aarch64/vf_bwdif_aarch64.o
+ NEON-OBJS-$(CONFIG_NLMEANS_FILTER) += aarch64/vf_nlmeans_neon.o
+diff --git a/libavfilter/aarch64/vf_bwdif_aarch64.S b/libavfilter/aarch64/vf_bwdif_aarch64.S
+new file mode 100644
+index 0000000000..d6e047dbde
+--- /dev/null
++++ b/libavfilter/aarch64/vf_bwdif_aarch64.S
+@@ -0,0 +1,410 @@
++#include "libavutil/aarch64/asm.S"
++
++.macro SQSHRUNN b, s0, s1, s2, s3, n
++ sqshrun \s0\().4h, \s0\().4s, #\n - 8
++ sqshrun2 \s0\().8h, \s1\().4s, #\n - 8
++ sqshrun \s1\().4h, \s2\().4s, #\n - 8
++ sqshrun2 \s1\().8h, \s3\().4s, #\n - 8
++ uzp2 \b\().16b, \s0\().16b, \s1\().16b
++.endm
++
++.macro SMULL4K a0, a1, a2, a3, s0, s1, k
++ smull \a0\().4s, \s0\().4h, \k
++ smull2 \a1\().4s, \s0\().8h, \k
++ smull \a2\().4s, \s1\().4h, \k
++ smull2 \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMULL4K a0, a1, a2, a3, s0, s1, k
++ umull \a0\().4s, \s0\().4h, \k
++ umull2 \a1\().4s, \s0\().8h, \k
++ umull \a2\().4s, \s1\().4h, \k
++ umull2 \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
++ umlal \a0\().4s, \s0\().4h, \k
++ umlal2 \a1\().4s, \s0\().8h, \k
++ umlal \a2\().4s, \s1\().4h, \k
++ umlal2 \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
++ umlsl \a0\().4s, \s0\().4h, \k
++ umlsl2 \a1\().4s, \s0\().8h, \k
++ umlsl \a2\().4s, \s1\().4h, \k
++ umlsl2 \a3\().4s, \s1\().8h, \k
++.endm
++
++
++// void ff_bwdif_filter_line4_aarch64(
++// void * dst1, // x0
++// int d_stride, // w1
++// const void * prev1, // x2
++// const void * cur1, // x3
++// const void * next1, // x4
++// int prefs, // w5
++// int w, // w6
++// int parity, // w7
++// int clip_max); // [sp, #0] (Ignored)
++
++// static const uint16_t coef_lf[2] = { 4309, 213 };
++// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
++// static const uint16_t coef_sp[2] = { 5077, 981 };
++
++ .align 16
++
++coeffs:
++ .hword 4309 * 4, 213 * 4 // lf[1]*4 = v0.h[1]
++ .hword 5570, 3801, 1016, -3801 // hf[0] = v0.h[2], -hf[1] =v0.h[5]
++ .hword 5077, 981
++
++function ff_bwdif_filter_line4_aarch64, export=1
++// #define prev2 cur
++// const uint8_t * restrict next2 = parity ? prev : next;
++ cmp w7, #0
++ csel x17, x2, x4, ne
++
++ // We want all the V registers - save all the ones we must
++ stp d14, d15, [sp, #-64]!
++ stp d8, d9, [sp, #48]
++ stp d10, d11, [sp, #32]
++ stp d12, d13, [sp, #16]
++
++ ldr q0, coeffs
++
++ // Some rearrangement of initial values for nice layout of refs
++ mov w10, w6 // w10 = loop count
++ neg w9, w5 // w9 = mref
++ lsl w8, w9, #1 // w8 = mref2
++ add w7, w9, w9, LSL #1 // w7 = mref3
++ lsl w6, w9, #2 // w6 = mref4
++ mov w11, w5 // w11 = pref
++ lsl w12, w5, #1 // w12 = pref2
++ add w13, w5, w5, LSL #1 // w13 = pref3
++ lsl w14, w5, #2 // w14 = pref4
++ add w15, w5, w5, LSL #2 // w15 = pref5
++ add w16, w14, w12 // w16 = pref6
++
++ lsl w5, w1, #1 // w5 = d_stride * 2
++
++// for (x = 0; x < w; x++) {
++// int diff0, diff2;
++// int d0, d2;
++// int temporal_diff0, temporal_diff2;
++//
++// int i1, i2;
++// int j1, j2;
++// int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
++
++10:
++// c0 = prev2[0] + next2[0]; // c0 = v20, v21
++// d0 = c0 >> 1; // d0 = v10
++// temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
++ ldr q31, [x3]
++ ldr q21, [x17]
++ uhadd v10.16b, v31.16b, v21.16b
++ uabd v11.16b, v31.16b, v21.16b
++ uaddl v20.8h, v21.8b, v31.8b
++ uaddl2 v21.8h, v21.16b, v31.16b
++
++ ldr q31, [x3, w6, SXTW]
++ ldr q23, [x17, w6, SXTW]
++
++// i1 = coef_hf[0] * c0; // i1 = v2-v5
++ UMULL4K v2, v3, v4, v5, v20, v21, v0.h[2]
++
++ ldr q30, [x3, w14, SXTW]
++ ldr q25, [x17, w14, SXTW]
++
++// m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v22,v23
++ uaddl v22.8h, v23.8b, v31.8b
++ uaddl2 v23.8h, v23.16b, v31.16b
++
++// p4 = prev2[prefs4] + next2[prefs4]; // p4 = v24,v25, (p4 >> 1) = v12
++ uhadd v12.16b, v25.16b, v30.16b
++ uaddl v24.8h, v25.8b, v30.8b
++ uaddl2 v25.8h, v25.16b, v30.16b
++
++// j1 = -coef_hf[1] * (c0 + p4); // j1 = v6-v9 (-c0:v20,v21)
++ add v20.8h, v20.8h, v24.8h
++ add v21.8h, v21.8h, v25.8h
++ SMULL4K v6, v7, v8, v9, v20, v21, v0.h[5]
++
++// m3 = cur[mrefs3]; // m3 = v20
++ ldr q20, [x3, w7, SXTW]
++
++// p3 = cur[prefs3]; // p3 = v21
++ ldr q21, [x3, w13, SXTW]
++
++// i1 += coef_hf[2] * (m4 + p4); // (-m4:v22,v23) (-p4:v24,v25)
++ add v22.8h, v22.8h, v24.8h
++ add v23.8h, v23.8h, v25.8h
++ UMLAL4K v2, v3, v4, v5, v22, v23, v0.h[4]
++
++ ldr q29, [x3, w8, SXTW]
++ ldr q23, [x17, w8, SXTW]
++
++// i1 -= coef_lf[1] * 4 * (m3 + p3); // -
++ uaddl v30.8h, v20.8b, v21.8b
++ uaddl2 v31.8h, v20.16b, v21.16b
++
++ ldr q28, [x3, w16, SXTW]
++ ldr q25, [x17, w16, SXTW]
++
++ UMLSL4K v2, v3, v4, v5, v30, v31, v0.h[1]
++
++// m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v22,v23, (m2 >> 1) = v13
++ uhadd v13.16b, v23.16b, v29.16b
++ uaddl v22.8h, v23.8b, v29.8b
++ uaddl2 v23.8h, v23.16b, v29.16b
++
++ ldr q31, [x3, w12, SXTW]
++ ldr q27, [x17, w12, SXTW]
++
++// p6 = prev2[prefs6] + next2[prefs6]; // p6 = v24,v25
++ uaddl v24.8h, v25.8b, v28.8b
++ uaddl2 v25.8h, v25.16b, v28.16b
++
++// j1 += coef_hf[2] * (m2 + p6); // (-p6:v24,v25)
++ add v24.8h, v24.8h, v22.8h
++ add v25.8h, v25.8h, v23.8h
++ UMLAL4K v6, v7, v8, v9, v24, v25, v0.h[4]
++
++// m1 = cur[mrefs]; // m1 = v24
++ ldr q24, [x3, w9, SXTW]
++
++// p5 = cur[prefs5]; // p5 = v25
++ ldr q25, [x3, w15, SXTW]
++
++
++// p2 = prev2[prefs2] + next2[prefs2]; // p2 = v26, v27
++// temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
++// d2 = p2 >> 1; // d2 = v15
++ uabd v14.16b, v31.16b, v27.16b
++ uhadd v15.16b, v31.16b, v27.16b
++ uaddl v26.8h, v27.8b, v31.8b
++ uaddl2 v27.8h, v27.16b, v31.16b
++
++// j1 += coef_hf[0] * p2; // -
++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[2]
++
++// i1 -= coef_hf[1] * (m2 + p2); // (-m2:v22,v23*) (-p2:v26*,v27*)
++ add v22.8h, v22.8h, v26.8h
++ add v23.8h, v23.8h, v27.8h
++ UMLSL4K v2, v3, v4, v5, v22, v23, v0.h[3]
++
++// p1 = cur[prefs]; // p1 = v22
++ ldr q22, [x3, w11, SXTW]
++
++// j1 -= coef_lf[1] * 4 * (m1 + p5); // -
++ uaddl v26.8h, v24.8b, v25.8b
++ uaddl2 v27.8h, v24.16b, v25.16b
++ UMLSL4K v6, v7, v8, v9, v26, v27, v0.h[1]
++
++// j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16
++ uaddl v18.8h, v22.8b, v21.8b
++ uaddl2 v19.8h, v22.16b, v21.16b
++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
++
++ uaddl v18.8h, v24.8b, v25.8b
++ uaddl2 v19.8h, v24.16b, v25.16b
++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
++
++ SQSHRUNN v16, v28, v29, v30, v31, 13
++
++// i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
++ uaddl v18.8h, v22.8b, v24.8b
++ uaddl2 v19.8h, v22.16b, v24.16b
++ UMULL4K v28, v29, v30, v31, v18, v19, v0.h[6]
++
++ uaddl v18.8h, v20.8b, v21.8b
++ uaddl2 v19.8h, v20.16b, v21.16b
++ UMLSL4K v28, v29, v30, v31, v18, v19, v0.h[7]
++
++ SQSHRUNN v17, v28, v29, v30, v31, 13
++
++// i1 += coef_lf[0] * 4 * (m1 + p1); // p1 = v22, m1 = v24
++ uaddl v26.8h, v24.8b, v22.8b
++ uaddl2 v27.8h, v24.16b, v22.16b
++ UMLAL4K v2, v3, v4, v5, v26, v27, v0.h[0]
++
++ ldr q31, [x2, w9, SXTW]
++ ldr q29, [x4, w9, SXTW]
++
++// j1 += coef_lf[0] * 4 * (p1 + p3); // p1 = v22, p3 = v21
++ uaddl v26.8h, v21.8b, v22.8b
++ uaddl2 v27.8h, v21.16b, v22.16b
++ UMLAL4K v6, v7, v8, v9, v26, v27, v0.h[0]
++
++ ldr q30, [x2, w11, SXTW]
++ ldr q28, [x4, w11, SXTW]
++
++// i1 >>= 15; // i1 = v2, -v3, -v4*, -v5*
++ SQSHRUNN v2, v2, v3, v4, v5, 15
++
++// j1 >>= 15; // j1 = v3, -v6*, -v7*, -v8*, -v9*
++ SQSHRUNN v3, v6, v7, v8, v9, 15
++
++// {
++// int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++// int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++ uabd v30.16b, v22.16b, v30.16b
++ uabd v31.16b, v24.16b, v31.16b
++ uabd v28.16b, v22.16b, v28.16b
++ uabd v29.16b, v24.16b, v29.16b
++ uhadd v31.16b, v31.16b, v30.16b
++ uhadd v29.16b, v29.16b, v28.16b
++
++ ldr q27, [x2, w13, SXTW]
++ ldr q26, [x4, w13, SXTW]
++
++// diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
++ ushr v18.16b, v11.16b, #1
++ umax v18.16b, v18.16b, v31.16b
++ umax v18.16b, v18.16b, v29.16b
++
++// } // v28, v30 preserved for next block
++// { // tdiff2 = v14
++// int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1;
++// int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1;
++ uabd v31.16b, v21.16b, v27.16b
++ uabd v29.16b, v21.16b, v26.16b
++ uhadd v31.16b, v31.16b, v30.16b
++ uhadd v29.16b, v29.16b, v28.16b
++
++// diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19
++ ushr v19.16b, v14.16b, #1
++ umax v19.16b, v19.16b, v31.16b
++ umax v19.16b, v19.16b, v29.16b
++
++// }
++// { // (m2 >> 1) = v13, m1 = v24, d0 = v10, d2 = v15, p1 = v22, diff0 = v18
++// int b = (m2 >> 1) - m1;
++// int f = d2 - p1;
++// int dc = d0 - m1;
++// int de = d0 - p1;
++// int sp_max = FFMIN(p1 - d0, m1 - d0);
++ uqsub v31.16b, v22.16b, v10.16b
++ uqsub v29.16b, v24.16b, v10.16b
++ umin v29.16b, V31.16b, v29.16b
++
++// sp_max = FFMIN(sp_max, FFMAX(-b,-f));
++ uqsub v30.16b, v24.16b, v13.16b
++ uqsub v28.16b, v22.16b, v15.16b
++ umax v28.16b, v28.16b, v30.16b
++ umin v27.16b, v29.16b, v28.16b
++
++// int sp_min = FFMIN(d0 - p1, d0 - m1);
++ uqsub v31.16b, v10.16b, v22.16b
++ uqsub v29.16b, v10.16b, v24.16b
++ umin v29.16b, V31.16b, v29.16b
++
++// sp_min = FFMIN(sp_min, FFMAX(b,f));
++ uqsub v30.16b, v13.16b, v24.16b
++ uqsub v28.16b, v15.16b, v22.16b
++ umax v28.16b, v28.16b, v30.16b
++ umin v26.16b, v29.16b, v28.16b
++
++// diff0 = FFMAX3(diff0, sp_min, sp_max); // diff0 = v18
++ umax v18.16b, v18.16b, v27.16b
++ umax v18.16b, v18.16b, v26.16b
++// }
++// { // (p4 >> 1) = v12, p3 = v21, d0 = v10, d2 = v15, p1 = v22, diff2 = v19
++// int b = d0 - p1; // 1
++// int f = (p4 >> 1) - p3; // [v23]
++// int dc = d2 - p1;
++// int de = d2 - p3;
++// int sp_max = FFMIN(-de, -dc);
++ uqsub v31.16b, v21.16b, v15.16b
++ uqsub v29.16b, v22.16b, v15.16b
++ umin v29.16b, V31.16b, v29.16b
++
++// sp_max = FFMIN(sp_max, FFMAX(-b,-f));
++ uqsub v30.16b, v22.16b, v10.16b
++ uqsub v28.16b, v21.16b, v12.16b
++ umax v28.16b, v28.16b, v30.16b
++ umin v27.16b, v29.16b, v28.16b
++
++// int sp_min = FFMIN(de, dc);
++ uqsub v31.16b, v15.16b, v21.16b
++ uqsub v29.16b, v15.16b, v22.16b
++ umin v29.16b, V31.16b, v29.16b
++
++// sp_min = FFMIN(sp_min, FFMAX(b,f));
++ uqsub v30.16b, v10.16b, v22.16b
++ uqsub v28.16b, v12.16b, v21.16b
++ umax v28.16b, v28.16b, v30.16b
++ umin v26.16b, v29.16b, v28.16b
++
++// diff2 = FFMAX3(diff2, sp_min, sp_max);
++ umax v19.16b, v19.16b, v27.16b
++ umax v19.16b, v19.16b, v26.16b
++
++// }
++//
++//
++// {
++// int interpol = FFABS(p1 - p3) > temporal_diff2 ? j1:j2; // interpol = v6 (-j1:v6) (-j2=v16)
++ uabd v31.16b, v22.16b, v21.16b
++ cmhi v31.16b, v31.16b, v14.16b
++ bif v3.16b, v16.16b, v31.16b
++
++// if (interpol > d2 + diff2)
++// interpol = d2 + diff2;
++ uqadd v30.16b, v15.16b, v19.16b
++ umin v3.16b, v3.16b, v30.16b
++
++// else if (interpol < d2 - diff2)
++// interpol = d2 - diff2;
++ uqsub v29.16b, v15.16b, v19.16b
++ umax v3.16b, v3.16b, v29.16b
++
++// dst[d_stride * 2] = av_clip_uint8(interpol);
++ str q3, [x0, w5, SXTW]
++
++// }
++
++// dst[d_stride] = p1;
++ str q22, [x0, w1, SXTW]
++
++
++// {
++// int interpol = FFABS(m1 - p1) > temporal_diff0 ? i1:i2;
++ uabd v31.16b, v24.16b, v22.16b // m1 = v24, p1 = v22
++ cmhi v31.16b, v31.16b, v11.16b // td0 = v11
++ bif v2.16b, v17.16b, v31.16b // i1 = v2, i2 = v17
++
++// if (interpol > d0 + diff0)
++// interpol = d0 + diff0;
++ uqadd v30.16b, v10.16b, v18.16b // diff0 = v18
++ umin v2.16b, v2.16b, v30.16b
++
++// else if (interpol < d0 - diff0)
++// interpol = d0 - diff0;
++ uqsub v29.16b, v10.16b, v18.16b
++ umax v2.16b, v2.16b, v29.16b
++//
++// dst[0] = av_clip_uint8(interpol);
++ str q2, [x0], #16
++// }
++//
++// dst++;
++// cur++;
++// prev++;
++// prev2++;
++// next++;
++// }
++
++ subs w10, w10, #16
++ add x2, x2, #16
++ add x3, x3, #16
++ add x4, x4, #16
++ add x17, x17, #16
++ bgt 10b
++
++ ldp d12, d13, [sp, #16]
++ ldp d10, d11, [sp, #32]
++ ldp d8, d9, [sp, #48]
++ ldp d14, d15, [sp], #64
++ ret
+diff --git a/libavfilter/aarch64/vf_bwdif_aarch64.h b/libavfilter/aarch64/vf_bwdif_aarch64.h
+new file mode 100644
+index 0000000000..8d97802e5e
+--- /dev/null
++++ b/libavfilter/aarch64/vf_bwdif_aarch64.h
+@@ -0,0 +1,8 @@
++#ifndef AVFILTER_AARCH64_VF_BWDIF_H_
++#define AVFILTER_AARCH64_VF_BWDIF_H_
++
++void ff_bwdif_filter_line4_aarch64(void * dst1, int d_stride,
++ const void * prev1, const void * cur1, const void * next1, int prefs,
++ int w, int parity, int clip_max);
++
++#endif
+diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+new file mode 100644
+index 0000000000..c5506424c9
+--- /dev/null
++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+@@ -0,0 +1,273 @@
++#include "libavutil/common.h"
++#include "libavutil/aarch64/cpu.h"
++#include "../avfilter.h"
++#include "../bwdif.h"
++#include "vf_bwdif_aarch64.h"
++
++/*
++ * Filter coefficients coef_lf and coef_hf taken from BBC PH-2071 (Weston 3 Field Deinterlacer).
++ * Used when there is spatial and temporal interpolation.
++ * Filter coefficients coef_sp are used when there is spatial interpolation only.
++ * Adjusted for matching visual sharpness impression of spatial and temporal interpolation.
++ */
++static const uint16_t coef_lf[2] = { 4309, 213 };
++static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
++static const uint16_t coef_sp[2] = { 5077, 981 };
++
++#define NEXT_LINE()\
++ dst += d_stride; \
++ prev += prefs; \
++ cur += prefs; \
++ next += prefs;
++
++static void filter_line4_check(void *restrict dst1, int d_stride,
++ const void *restrict prev1, const void *restrict cur1, const void *restrict next1, int prefs,
++ int w, int parity, int clip_max)
++{
++ uint8_t * restrict dst = dst1;
++ const uint8_t * restrict prev = prev1;
++ const uint8_t * restrict cur = cur1;
++ const uint8_t * restrict next = next1;
++
++ const int mrefs = -prefs;
++ const int mrefs2 = mrefs * 2;
++ const int prefs2 = prefs * 2;
++ const int mrefs3 = mrefs * 3;
++ const int prefs3 = prefs * 3;
++ const int mrefs4 = mrefs * 4;
++ const int prefs4 = prefs * 4;
++
++ static int n = 0;
++ uint64_t buf[2048*4/sizeof(uint64_t)];
++ int i, j;
++ static int fail_count = 0;
++
++ memset(dst, 0xba, d_stride * 3);
++ memset(buf, 0xba, d_stride * 3);
++
++ ff_bwdif_filter_line4_aarch64(dst, d_stride, prev, cur, next, prefs, w, parity, clip_max);
++
++ dst = (uint8_t*)buf;
++ prev = prev1;
++ cur = cur1;
++ next = next1;
++
++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
++ prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
++ NEXT_LINE();
++ memcpy(dst, cur, w);
++ NEXT_LINE();
++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
++ prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
++
++ for (j = 0; j != 3; ++j)
++ {
++ const uint8_t * ref = (uint8_t*)buf + j * d_stride;
++ const uint8_t * tst = (uint8_t*)dst1 + j * d_stride;
++ for (i = 0; i != w; ++i)
++ {
++ if (ref[i] != tst[i])
++ {
++ printf("n=%d, (%d,%d): Ref: %02x, Tst: %02x\n", n, i, j, ref[i], tst[i]);
++ if (fail_count++ > 16)
++ exit(1);
++ }
++ }
++ }
++
++ ++n;
++}
++
++static void __attribute__((optimize("tree-vectorize"))) filter_line4_debug(void *restrict dst1, int d_stride,
++ const void *restrict prev1, const void *restrict cur1, const void *restrict next1, int prefs,
++ int w, int parity, int clip_max)
++{
++ uint8_t * restrict dst = dst1;
++ const uint8_t * restrict prev = prev1;
++ const uint8_t * restrict cur = cur1;
++ const uint8_t * restrict next = next1;
++
++ const int mrefs = -prefs;
++ const int mrefs2 = mrefs * 2;
++ const int prefs2 = prefs * 2;
++ const int mrefs3 = mrefs * 3;
++ const int prefs3 = prefs * 3;
++ const int mrefs4 = mrefs * 4;
++ const int prefs4 = prefs * 4;
++
++ static int n = 0;
++ static int itt = -1;
++
++ {
++ int x;
++#define prev2 cur
++ const uint8_t * restrict next2 = parity ? prev : next;
++
++ for (x = 0; x < w; x++) {
++ int diff0, diff2;
++ int d0, d2;
++ int temporal_diff0, temporal_diff2;
++
++ int i1, i2;
++ int j1, j2;
++ int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
++
++ if ((x & 15) == 0)
++ ++itt;
++
++// printf("======= n=%d x=%d [iteration %d.%d] =======\n", n, x, itt, x & 15);
++ c0 = prev2[0] + next2[0]; // c0 = v20,v26
++ d0 = c0 >> 1; // d0 = v21
++ temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v9
++// printf("c0=%d, d0=%d, temporal_diff0=%d\n", c0, d0, temporal_diff0);
++ i1 = coef_hf[0] * c0; // -
++// printf("i1=%d\n", i1);
++ m4 = prev2[mrefs4] + next2[mrefs4]; // m4 = v3,v4
++ p4 = prev2[prefs4] + next2[prefs4]; // p4 = v5,v6, (p4 >> 1) = v23
++ j1 = -coef_hf[1] * (c0 + p4); // (-c0:v20,v26*)
++// printf("m4=%d, p4=%d, j1=%d\n", m4, p4, j1);
++ i1 += coef_hf[2] * (m4 + p4); // (-m4:v3,v4) (-p4:v5,v6) i1 = v3,v4,v7,v8
++// printf("hf2 i1=%d\n", i1);
++ m3 = cur[mrefs3]; // m3 = v5
++ p3 = cur[prefs3]; // p3 = v10, [f2=v23]
++ i1 -= coef_lf[1] * 4 * (m3 + p3); // -
++// printf("lf1 i1=%d\n", i1);
++ m2 = prev2[mrefs2] + next2[mrefs2]; // m2 = v11,v12, (m2 >> 1) = v22
++ p6 = prev2[prefs4 + prefs2] + next2[prefs4 + prefs2]; // p6=v0,v1
++ j1 += coef_hf[2] * (m2 + p6); // (-p6:v0*,v1*), j1 = v13,v14,v15,v16
++// printf("hf2 j1=%d\n", j1);
++ p2 = prev2[prefs2] + next2[prefs2]; // p2 = v17,v18
++ temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v6
++ j1 += coef_hf[0] * p2; // -
++ d2 = p2 >> 1; // d2 = v19
++ i1 -= coef_hf[1] * (m2 + p2); // (-m2:v11,v12)
++// printf("hf1 i1=%d\n", i1);
++ m1 = cur[mrefs]; // m1 = v11, [b0=v22]
++ p5 = cur[prefs3 + prefs2]; // p5=v2
++ j1 -= coef_lf[1] * 4 * (m1 + p5); // -
++ p1 = cur[prefs]; // p1 = v12
++ dst[d_stride] = p1;
++ j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1] * (m1 + p5)) >> 13; // (-p5:v2) j2=v2
++ i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v5) i2=v5
++ {
++ int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++ int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++ diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v24
++// printf("tdiff0=%d, t1=%d, t2=%d\n", temporal_diff0, t1, t2);
++ }
++ {
++ int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1;
++ int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1;
++ diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v25
++// printf("tdiff2=%d, t1=%d, t2=%d\n", temporal_diff2, t1, t2);
++ }
++ i1 += coef_lf[0] * 4 * (m1 + p1); // -
++ j1 += coef_lf[0] * 4 * (p1 + p3); // -
++// printf("lf0 i1=%d, j1=%d, diff0=%d, diff2=%d\n", i1, j1, diff0, diff2);
++ {
++ int b = (m2 >> 1) - m1; // [v22]
++ int f = d2 - p1; // 1
++ int dc = d0 - m1;
++ int de = d0 - p1;
++ int sp_max = FFMIN(-de, -dc);
++ int sp_min = FFMIN(de, dc);
++ sp_max = FFMIN(sp_max, FFMAX(-b,-f));
++ sp_min = FFMIN(sp_min, FFMAX(b,f));
++// printf("spmax0=%d, spmin0=%d, b=%d, f=%d, dc=%d, de=%d\n", sp_max, sp_min, b, f, dc, de);
++ diff0 = FFMAX3(diff0, sp_min, sp_max);
++ }
++ {
++ int b = d0 - p1; // 1
++ int f = (p4 >> 1) - p3; // [v23]
++ int dc = d2 - p1;
++ int de = d2 - p3;
++ int sp_max = FFMIN(-de, -dc);
++ int sp_min = FFMIN(de, dc);
++ sp_max = FFMIN(sp_max, FFMAX(-b,-f));
++ sp_min = FFMIN(sp_min, FFMAX(b,f));
++// printf("spmax2=%d, spmin2=%d, b=%d, f=%d, dc=%d, de=%d\n", sp_max, sp_min, b, f, dc, de);
++ diff2 = FFMAX3(diff2, sp_min, sp_max);
++ }
++
++ i1 >>= 15;
++ j1 >>= 15;
++
++// printf("Final i1=%d, i2=%d, j1=%d, j2=%d\n", i1, i2, j1, j2);
++
++
++ {
++ int interpol = FFABS(p1 - p3) > temporal_diff2 ? j1:j2;
++
++// printf("diff2=%d, interpol=%d, d2=%d\n", diff2, interpol, d2);
++
++ if (interpol > d2 + diff2)
++ interpol = d2 + diff2;
++ else if (interpol < d2 - diff2)
++ interpol = d2 - diff2;
++ dst[d_stride * 2] = av_clip_uint8(interpol);
++ }
++ {
++ int interpol = FFABS(m1 - p1) > temporal_diff0 ? i1:i2;
++
++// printf("diff0=%d, interpol=%d, d0=%d\n", diff0, interpol, d0);
++
++ if (interpol > d0 + diff0)
++ interpol = d0 + diff0;
++ else if (interpol < d0 - diff0)
++ interpol = d0 - diff0;
++
++ dst[0] = av_clip_uint8(interpol);
++ }
++// printf("dst[0]=%d, dst[2]=%d\n", dst[0], dst[d_stride*2]);
++
++ dst++;
++ cur++;
++ prev++;
++ next++;
++ next2++;
++// if (n >= 513 && x >= 719)
++// {
++// exit(99);
++// }
++ }
++#undef prev2
++
++// NEXT_LINE();
++// memcpy(dst, cur, w);
++ ++n;
++ }
++}
++
++
++void
++ff_bwdif_init_aarch64(AVFilterContext *ctx)
++{
++ const int cpu_flags = av_get_cpu_flags();
++ BWDIFContext *s = ctx->priv;
++ YADIFContext *yadif = &s->yadif;
++
++ if ((ctx->inputs[0]->w & 31) != 0)
++ {
++ av_log(ctx, AV_LOG_DEBUG, "Cannot use aarch64 optimization: w=%d, (needs multiple of 32)\n", ctx->inputs[0]->w);
++ return;
++ }
++ if (yadif->csp->comp[0].depth != 8)
++ {
++ av_log(ctx, AV_LOG_DEBUG, "Cannot use aarch64 optimization: bits=%d, (only 8 supported)\n", yadif->csp->comp[0].depth);
++ return;
++ }
++
++ if (!have_neon(cpu_flags))
++ {
++ av_log(ctx, AV_LOG_DEBUG, "Cannot use aarch64 optimization: no NEON!\n");
++ return;
++ }
++
++ if (yadif->useasm == 3)
++ s->filter_line4 = filter_line4_check;
++ else if (yadif->useasm == 2)
++ s->filter_line4 = filter_line4_debug;
++ else
++ s->filter_line4 = ff_bwdif_filter_line4_aarch64;
++}
++
+diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
+index d7db46c2af..d504fa1bc8 100644
+--- a/libavfilter/allfilters.c
++++ b/libavfilter/allfilters.c
+@@ -248,6 +248,7 @@ extern const AVFilter ff_vf_derain;
+ extern const AVFilter ff_vf_deshake;
+ extern const AVFilter ff_vf_deshake_opencl;
+ extern const AVFilter ff_vf_despill;
++extern const AVFilter ff_vf_deinterlace_v4l2m2m;
+ extern const AVFilter ff_vf_detelecine;
+ extern const AVFilter ff_vf_dilation;
+ extern const AVFilter ff_vf_dilation_opencl;
+@@ -420,6 +421,7 @@ extern const AVFilter ff_vf_scale;
+ extern const AVFilter ff_vf_scale_cuda;
+ extern const AVFilter ff_vf_scale_npp;
+ extern const AVFilter ff_vf_scale_qsv;
++extern const AVFilter ff_vf_scale_v4l2m2m;
+ extern const AVFilter ff_vf_scale_vaapi;
+ extern const AVFilter ff_vf_scale_vulkan;
+ extern const AVFilter ff_vf_scale2ref;
+@@ -490,6 +492,7 @@ extern const AVFilter ff_vf_trim;
+ extern const AVFilter ff_vf_unpremultiply;
+ extern const AVFilter ff_vf_unsharp;
+ extern const AVFilter ff_vf_unsharp_opencl;
++extern const AVFilter ff_vf_unsand;
+ extern const AVFilter ff_vf_untile;
+ extern const AVFilter ff_vf_uspp;
+ extern const AVFilter ff_vf_v360;
+diff --git a/libavfilter/buffersink.c b/libavfilter/buffersink.c
+index 306c283f77..d3c82aabf3 100644
+--- a/libavfilter/buffersink.c
++++ b/libavfilter/buffersink.c
+@@ -62,6 +62,11 @@ typedef struct BufferSinkContext {
+ int sample_rates_size;
+
+ AVFrame *peeked_frame;
++
++ union {
++ av_buffersink_alloc_video_frame * video;
++ } alloc_cb;
++ void * alloc_v;
+ } BufferSinkContext;
+
+ #define NB_ITEMS(list) (list ## _size / sizeof(*list))
+@@ -154,6 +159,44 @@ int attribute_align_arg av_buffersink_get_samples(AVFilterContext *ctx,
+ return get_frame_internal(ctx, frame, 0, nb_samples);
+ }
+
++static AVFrame * alloc_video_buffer(AVFilterLink *link, int w, int h)
++{
++ AVFilterContext * const ctx = link->dst;
++ BufferSinkContext * const bs = ctx->priv;
++ return bs->alloc_cb.video ? bs->alloc_cb.video(ctx, bs->alloc_v, w, h) :
++ ff_default_get_video_buffer(link, w, h);
++}
++
++int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v)
++{
++ BufferSinkContext * const bs = ctx->priv;
++ bs->alloc_cb.video = cb;
++ bs->alloc_v = v;
++ return 0;
++}
++
++#if FF_API_BUFFERSINK_ALLOC
++AVBufferSinkParams *av_buffersink_params_alloc(void)
++{
++ static const int pixel_fmts[] = { AV_PIX_FMT_NONE };
++ AVBufferSinkParams *params = av_malloc(sizeof(AVBufferSinkParams));
++ if (!params)
++ return NULL;
++
++ params->pixel_fmts = pixel_fmts;
++ return params;
++}
++
++AVABufferSinkParams *av_abuffersink_params_alloc(void)
++{
++ AVABufferSinkParams *params = av_mallocz(sizeof(AVABufferSinkParams));
++
++ if (!params)
++ return NULL;
++ return params;
++}
++#endif
++
+ static av_cold int common_init(AVFilterContext *ctx)
+ {
+ BufferSinkContext *buf = ctx->priv;
+@@ -381,6 +424,7 @@ static const AVFilterPad avfilter_vsink_buffer_inputs[] = {
+ {
+ .name = "default",
+ .type = AVMEDIA_TYPE_VIDEO,
++ .get_buffer = {.video = alloc_video_buffer},
+ },
+ };
+
+diff --git a/libavfilter/buffersink.h b/libavfilter/buffersink.h
+index 64e08de53e..09737d322f 100644
+--- a/libavfilter/buffersink.h
++++ b/libavfilter/buffersink.h
+@@ -166,6 +166,9 @@ int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame);
+ */
+ int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples);
+
++typedef AVFrame * av_buffersink_alloc_video_frame(AVFilterContext * ctx, void * v, int w, int h);
++int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v);
++
+ /**
+ * @}
+ */
+diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c
+index ba17450b93..0dbe5d2335 100644
+--- a/libavfilter/buffersrc.c
++++ b/libavfilter/buffersrc.c
+@@ -201,7 +201,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
+
+ switch (ctx->outputs[0]->type) {
+ case AVMEDIA_TYPE_VIDEO:
+- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
++ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
+ frame->format, frame->pts);
+ break;
+ case AVMEDIA_TYPE_AUDIO:
+diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
+index 889ff772ed..5ba8006e42 100644
+--- a/libavfilter/bwdif.h
++++ b/libavfilter/bwdif.h
+@@ -35,8 +35,17 @@ typedef struct BWDIFContext {
+ void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat);
++ void (*filter_line4)(void *dst, int dstride,
++ const void *prev, const void *cur, const void *next, int prefs,
++ int w, int parity, int clip_max);
+ } BWDIFContext;
+
++void ff_bwdif_filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
++ int w, int prefs, int mrefs, int prefs2, int mrefs2,
++ int prefs3, int mrefs3, int prefs4, int mrefs4,
++ int parity, int clip_max);
++
+ void ff_bwdif_init_x86(BWDIFContext *bwdif);
++void ff_bwdif_init_aarch64(AVFilterContext *ctx);
+
+ #endif /* AVFILTER_BWDIF_H */
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index 65c617ebb3..fbc9491642 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -38,6 +38,10 @@
+ #include "video.h"
+ #include "bwdif.h"
+
++#include <time.h>
++#define OPT_TEST 0
++#define OPT_NEW 0
++
+ /*
+ * Filter coefficients coef_lf and coef_hf taken from BBC PH-2071 (Weston 3 Field Deinterlacer).
+ * Used when there is spatial and temporal interpolation.
+@@ -74,10 +78,10 @@ typedef struct ThreadData {
+ int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \
+ int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \
+ int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
+- \
++ {/*\
+ if (!diff) { \
+ dst[0] = d; \
+- } else {
++ } else {*/
+
+ #define SPAT_CHECK() \
+ int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \
+@@ -89,15 +93,16 @@ typedef struct ThreadData {
+ diff = FFMAX3(diff, min, -max);
+
+ #define FILTER_LINE() \
++ int i1, i2; \
+ SPAT_CHECK() \
+- if (FFABS(c - e) > temporal_diff0) { \
+- interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \
++ /*if (FFABS(c - e) > temporal_diff0)*/ { \
++ i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \
+ - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \
+ + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \
+ + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
+- } else { \
+- interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
+- }
++ } /*else*/ { \
++ i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
++ }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\
+
+ #define FILTER_EDGE() \
+ if (spat) { \
+@@ -111,7 +116,7 @@ typedef struct ThreadData {
+ else if (interpol < d - diff) \
+ interpol = d - diff; \
+ \
+- dst[0] = av_clip(interpol, 0, clip_max); \
++ dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \
+ } \
+ \
+ dst++; \
+@@ -122,7 +127,7 @@ typedef struct ThreadData {
+ next2++; \
+ }
+
+-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
++static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max)
+ {
+ uint8_t *dst = dst1;
+@@ -132,7 +137,101 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
+ FILTER_INTRA()
+ }
+
+-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
++#if OPT_NEW
++void __attribute__((optimize("tree-vectorize"))) ff_bwdif_filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
++ int w, int prefs, int mrefs, int prefs2, int mrefs2,
++ int prefs3, int mrefs3, int prefs4, int mrefs4,
++ int parity, int clip_max)
++{
++ if (parity) {
++ uint8_t * restrict dst = dst1;
++ const uint8_t * prev = prev1;
++ const uint8_t * cur = cur1;
++ const uint8_t * next = next1;
++ const uint8_t * prev2 = prev;
++ const uint8_t * next2 = cur;
++ int interpol, x;
++
++ FILTER1()
++ FILTER_LINE()
++ FILTER2()
++ }
++ else {
++ uint8_t * restrict dst = dst1;
++ const uint8_t * prev = prev1;
++ const uint8_t * cur = cur1;
++ const uint8_t * next = next1;
++ int interpol, x;
++#define prev2 cur
++#define next2 next
++
++ for (x = 0; x < w; x++) {
++ int diff0;
++ int d0;
++ int temporal_diff0;
++
++ int i1, i2;
++ int p4, p3, p2, p1, c0, m1, m2, m3, m4;
++
++ m4 = prev2[mrefs4] + next2[mrefs4]; // 2
++ p4 = prev2[prefs4] + next2[prefs4];
++ i1 = coef_hf[2] * (m4 + p4);
++ m3 = cur[mrefs3]; // 1
++ p3 = cur[prefs3];
++ i1 += -coef_lf[1] * 4 * (m3 + p3);
++ m2 = prev2[mrefs2] + next2[mrefs2]; // 2
++ p2 = prev2[prefs2] + next2[prefs2]; // 2
++ i1 += -coef_hf[1] * (m2 + p2);
++ m1 = cur[mrefs]; // 1
++ p1 = cur[prefs]; // 1
++ c0 = prev2[0] + next2[0]; // 2
++ i1 += coef_hf[0] * c0; // 4
++ d0 = c0 >> 1; // 1
++ temporal_diff0 = FFABS(prev2[0] - next2[0]); // 1
++ i1 += coef_lf[0] * 4 * (m1 + p1); // -
++ {
++ int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++ int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++ diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // 1
++ }
++ {
++ int b = (m2 >> 1) - m1; // 1
++ int f = (p2 >> 1) - p1; // 1
++ int dc = d0 - m1;
++ int de = d0 - p1;
++ int sp_max = FFMAX(de, dc);
++ int sp_min = FFMIN(de, dc);
++ sp_max = FFMAX(sp_max, FFMIN(b,f));
++ sp_min = FFMIN(sp_min, FFMAX(b,f));
++ diff0 = FFMAX3(diff0, sp_min, -sp_max);
++ }
++
++ i1 >>= 15;
++
++ i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13;
++
++
++ interpol = FFABS(m1 - p1) > temporal_diff0 ? i1:i2;
++
++ if (interpol > d0 + diff0)
++ interpol = d0 + diff0;
++ else if (interpol < d0 - diff0)
++ interpol = d0 - diff0;
++
++ dst[0] = av_clip_uint8(interpol);
++
++ dst++;
++ cur++;
++ prev++;
++ next++;
++#undef prev2
++#undef next2
++ }
++ }
++}
++
++#else
++void __attribute__((optimize("tree-vectorize"))) ff_bwdif_filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
+@@ -149,8 +248,34 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+ FILTER_LINE()
+ FILTER2()
+ }
++#endif
++
++#define NEXT_LINE()\
++ dst += d_stride; \
++ prev += prefs; \
++ cur += prefs; \
++ next += prefs;
++
++// ***** Temp
++static void __attribute__((optimize("tree-vectorize"))) filter_line4_c(void *restrict dst1, int d_stride,
++ const void *restrict prev1, const void *restrict cur1, const void *restrict next1, int prefs,
++ int w, int parity, int clip_max)
++{
++ uint8_t * restrict dst = dst1;
++ const uint8_t * restrict prev = prev1;
++ const uint8_t * restrict cur = cur1;
++ const uint8_t * restrict next = next1;
++
++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
++ NEXT_LINE();
++ memcpy(dst, cur, w);
++ NEXT_LINE();
++ ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
++ prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
++}
+
+-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
++static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat)
+ {
+@@ -167,7 +292,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
+ FILTER2()
+ }
+
+-static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs,
++static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
+ int prefs3, int mrefs3, int parity, int clip_max)
+ {
+ uint16_t *dst = dst1;
+@@ -177,7 +302,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre
+ FILTER_INTRA()
+ }
+
+-static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1,
++static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int prefs3, int mrefs3, int prefs4, int mrefs4,
+ int parity, int clip_max)
+@@ -195,7 +320,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1
+ FILTER2()
+ }
+
+-static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
++static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
+ int w, int prefs, int mrefs, int prefs2, int mrefs2,
+ int parity, int clip_max, int spat)
+ {
+@@ -244,6 +369,10 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+ refs << 1, -(refs << 1),
+ td->parity ^ td->tff, clip_max,
+ (y < 2) || ((y + 3) > td->h) ? 0 : 1);
++ } else if (s->filter_line4 && y + 2 < slice_end && ((y + 7) <= td->h)) {
++ s->filter_line4(dst, td->frame->linesize[td->plane], prev, cur, next, refs, td->w,
++ td->parity ^ td->tff, clip_max);
++ y += 2;
+ } else {
+ s->filter_line(dst, prev, cur, next, td->w,
+ refs, -refs, refs << 1, -(refs << 1),
+@@ -258,6 +387,19 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+ return 0;
+ }
+
++#if OPT_TEST
++static unsigned int test_frames = 0;
++static uint64_t cum_time = 0;
++static uint64_t min_delta = 99999999;
++static uint64_t max_delta = 0;
++static uint64_t utime(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000;
++}
++#endif
++
+ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
+ int parity, int tff)
+ {
+@@ -278,9 +420,23 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
+ td.w = w;
+ td.h = h;
+ td.plane = i;
+-
++#if OPT_TEST
++ {
++ const uint64_t now = utime();
++ uint64_t delta;
++ filter_slice(ctx, &td, 0, 1);
++ delta = utime() - now;
++ ++test_frames;
++ cum_time += delta;
++ if (min_delta > delta)
++ min_delta = delta;
++ if (max_delta < delta)
++ max_delta = delta;
++ }
++#else
+ ff_filter_execute(ctx, filter_slice, &td, NULL,
+ FFMIN(h, ff_filter_get_nb_threads(ctx)));
++#endif
+ }
+ if (yadif->current_field == YADIF_FIELD_END) {
+ yadif->current_field = YADIF_FIELD_NORMAL;
+@@ -297,6 +453,11 @@ static av_cold void uninit(AVFilterContext *ctx)
+ av_frame_free(&yadif->prev);
+ av_frame_free(&yadif->cur );
+ av_frame_free(&yadif->next);
++#if OPT_TEST
++ av_log(ctx, AV_LOG_INFO, "Stats: Avg:%"PRIu64", Max:%"PRIu64", Min:%"PRIu64"\n",
++ test_frames == 0 ? (uint64_t)0 : cum_time / test_frames,
++ max_delta, min_delta);
++#endif
+ }
+
+ static const enum AVPixelFormat pix_fmts[] = {
+@@ -340,19 +501,27 @@ static int config_props(AVFilterLink *link)
+
+ yadif->csp = av_pix_fmt_desc_get(link->format);
+ yadif->filter = filter;
++ s->filter_line4 = 0;
+ if (yadif->csp->comp[0].depth > 8) {
+ s->filter_intra = filter_intra_16bit;
+ s->filter_line = filter_line_c_16bit;
+ s->filter_edge = filter_edge_16bit;
+ } else {
+ s->filter_intra = filter_intra;
+- s->filter_line = filter_line_c;
++ s->filter_line = ff_bwdif_filter_line_c;
+ s->filter_edge = filter_edge;
++ if (yadif->useasm == 0)
++ s->filter_line4 = filter_line4_c;
+ }
+
++ if (yadif->useasm != 0)
++ {
+ #if ARCH_X86
+- ff_bwdif_init_x86(s);
++ ff_bwdif_init_x86(s);
++#elif ARCH_AARCH64
++ ff_bwdif_init_aarch64(ctx);
+ #endif
++ }
+
+ return 0;
+ }
+@@ -377,6 +546,7 @@ static const AVOption bwdif_options[] = {
+ CONST("all", "deinterlace all frames", YADIF_DEINT_ALL, "deint"),
+ CONST("interlaced", "only deinterlace frames marked as interlaced", YADIF_DEINT_INTERLACED, "deint"),
+
++ {"useasm", "use asm functions (default true)", OFFSET(useasm), AV_OPT_TYPE_INT, {.i64=1}, 0, 3, FLAGS, NULL },
+ { NULL }
+ };
+
+diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
+new file mode 100644
+index 0000000000..a173a291f8
+--- /dev/null
++++ b/libavfilter/vf_deinterlace_v4l2m2m.c
+@@ -0,0 +1,2102 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * deinterlace video filter - V4L2 M2M
++ */
++
++#include <drm_fourcc.h>
++
++#include <linux/videodev2.h>
++
++#include <dirent.h>
++#include <fcntl.h>
++#include <poll.h>
++#include <stdatomic.h>
++#include <stdio.h>
++#include <string.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++#include <unistd.h>
++
++#include "config.h"
++
++#include "libavutil/avassert.h"
++#include "libavutil/avstring.h"
++#include "libavutil/common.h"
++#include "libavutil/hwcontext.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavutil/internal.h"
++#include "libavutil/mathematics.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/time.h"
++
++#define FF_INTERNAL_FIELDS 1
++#include "framequeue.h"
++#include "filters.h"
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "scale_eval.h"
++#include "video.h"
++
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in drm_fourcc.h hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12 Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
++typedef struct V4L2Queue V4L2Queue;
++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
++
++typedef enum filter_type_v4l2_e
++{
++ FILTER_V4L2_DEINTERLACE = 1,
++ FILTER_V4L2_SCALE,
++} filter_type_v4l2_t;
++
++typedef struct V4L2Buffer {
++ int enqueued;
++ int reenqueue;
++ struct v4l2_buffer buffer;
++ AVFrame frame;
++ struct v4l2_plane planes[VIDEO_MAX_PLANES];
++ int num_planes;
++ AVDRMFrameDescriptor drm_frame;
++ V4L2Queue *q;
++} V4L2Buffer;
++
++typedef struct V4L2Queue {
++ struct v4l2_format format;
++ struct v4l2_selection sel;
++ int eos;
++ int num_buffers;
++ V4L2Buffer *buffers;
++ const char * name;
++ DeintV4L2M2MContextShared *ctx;
++} V4L2Queue;
++
++typedef struct pts_stats_s
++{
++ void * logctx;
++ const char * name; // For debug
++ unsigned int last_count;
++ unsigned int last_interval;
++ int64_t last_pts;
++} pts_stats_t;
++
++#define PTS_TRACK_SIZE 32
++typedef struct pts_track_el_s
++{
++ uint32_t n;
++ unsigned int interval;
++ AVFrame * props;
++} pts_track_el_t;
++
++typedef struct pts_track_s
++{
++ uint32_t n;
++ uint32_t last_n;
++ int got_2;
++ void * logctx;
++ pts_stats_t stats;
++ pts_track_el_t a[PTS_TRACK_SIZE];
++} pts_track_t;
++
++typedef enum drain_state_e
++{
++ DRAIN_NONE = 0, // Not draining
++ DRAIN_TIMEOUT, // Drain until normal timeout setup yields no frame
++ DRAIN_LAST, // Drain with long timeout last_frame in received on output expected
++ DRAIN_EOS, // Drain with long timeout EOS expected
++ DRAIN_DONE // Drained
++} drain_state_t;
++
++typedef struct DeintV4L2M2MContextShared {
++ void * logctx; // For logging - will be NULL when done
++ filter_type_v4l2_t filter_type;
++
++ int fd;
++ int done; // fd closed - awating all refs dropped
++ int width;
++ int height;
++
++ int drain; // EOS received (inlink status)
++ drain_state_t drain_state;
++ int64_t drain_pts; // PTS associated with inline status
++
++ unsigned int frames_rx;
++ unsigned int frames_tx;
++
++ // from options
++ int output_width;
++ int output_height;
++ enum AVPixelFormat output_format;
++
++ int has_enc_stop;
++ // We expect to get exactly the same number of frames out as we put in
++ // We can drain by matching input to output
++ int one_to_one;
++
++ int orig_width;
++ int orig_height;
++ atomic_uint refcount;
++
++ AVBufferRef *hw_frames_ctx;
++
++ unsigned int field_order;
++
++ pts_track_t track;
++
++ V4L2Queue output;
++ V4L2Queue capture;
++} DeintV4L2M2MContextShared;
++
++typedef struct DeintV4L2M2MContext {
++ const AVClass *class;
++
++ DeintV4L2M2MContextShared *shared;
++
++ char * w_expr;
++ char * h_expr;
++ char * output_format_string;
++
++ int force_original_aspect_ratio;
++ int force_divisible_by;
++
++ char *colour_primaries_string;
++ char *colour_transfer_string;
++ char *colour_matrix_string;
++ int colour_range;
++ char *chroma_location_string;
++
++ enum AVColorPrimaries colour_primaries;
++ enum AVColorTransferCharacteristic colour_transfer;
++ enum AVColorSpace colour_matrix;
++ enum AVChromaLocation chroma_location;
++} DeintV4L2M2MContext;
++
++
++static inline int drain_frame_expected(const drain_state_t d)
++{
++ return d == DRAIN_EOS || d == DRAIN_LAST;
++}
++
++// These just list the ones we know we can cope with
++static uint32_t
++fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
++{
++ switch (avfmt) {
++ case AV_PIX_FMT_YUV420P:
++ return V4L2_PIX_FMT_YUV420;
++ case AV_PIX_FMT_NV12:
++ return V4L2_PIX_FMT_NV12;
++#if CONFIG_SAND
++ case AV_PIX_FMT_RPI4_8:
++ case AV_PIX_FMT_SAND128:
++ return V4L2_PIX_FMT_NV12_COL128;
++#endif
++ default:
++ break;
++ }
++ return 0;
++}
++
++static enum AVPixelFormat
++fmt_v4l2_to_av(const uint32_t pixfmt)
++{
++ switch (pixfmt) {
++ case V4L2_PIX_FMT_YUV420:
++ return AV_PIX_FMT_YUV420P;
++ case V4L2_PIX_FMT_NV12:
++ return AV_PIX_FMT_NV12;
++#if CONFIG_SAND
++ case V4L2_PIX_FMT_NV12_COL128:
++ return AV_PIX_FMT_RPI4_8;
++#endif
++ default:
++ break;
++ }
++ return AV_PIX_FMT_NONE;
++}
++
++static unsigned int pts_stats_interval(const pts_stats_t * const stats)
++{
++ return stats->last_interval;
++}
++
++// Pick 64 for max last count - that is >1sec at 60fps
++#define STATS_LAST_COUNT_MAX 64
++#define STATS_INTERVAL_MAX (1 << 30)
++static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
++{
++ if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
++ if (stats->last_count < STATS_LAST_COUNT_MAX)
++ ++stats->last_count;
++ return;
++ }
++
++ if (stats->last_pts != AV_NOPTS_VALUE) {
++ const int64_t interval = pts - stats->last_pts;
++
++ if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
++ stats->last_count >= STATS_LAST_COUNT_MAX) {
++ if (stats->last_interval != 0)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
++ __func__, stats->name, interval, stats->last_count);
++ stats->last_interval = 0;
++ }
++ else {
++ const int64_t frame_time = interval / (int64_t)stats->last_count;
++
++ if (frame_time != stats->last_interval)
++ av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
++ __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
++ stats->last_interval = frame_time;
++ }
++ }
++
++ stats->last_pts = pts;
++ stats->last_count = 1;
++}
++
++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
++{
++ *stats = (pts_stats_t){
++ .logctx = logctx,
++ .name = name,
++ .last_count = 1,
++ .last_interval = 0,
++ .last_pts = AV_NOPTS_VALUE
++ };
++}
++
++static inline uint32_t pts_track_next_n(pts_track_t * const trk)
++{
++ if (++trk->n == 0)
++ trk->n = 1;
++ return trk->n;
++}
++
++static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
++{
++ uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000);
++ pts_track_el_t * t;
++
++ // As a first guess assume that n==0 means last frame
++ if (n == 0) {
++ n = trk->last_n;
++ if (n == 0)
++ goto fail;
++ }
++
++ t = trk->a + (n & (PTS_TRACK_SIZE - 1));
++
++ if (t->n != n) {
++ av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n);
++ goto fail;
++ }
++
++ // 1st frame is simple - just believe it
++ if (n != trk->last_n) {
++ trk->last_n = n;
++ trk->got_2 = 0;
++ return av_frame_copy_props(dst, t->props);
++ }
++
++ // Only believe in a single interpolated frame
++ if (trk->got_2)
++ goto fail;
++ trk->got_2 = 1;
++
++ av_frame_copy_props(dst, t->props);
++
++
++ // If we can't guess - don't
++ if (t->interval == 0) {
++ dst->best_effort_timestamp = AV_NOPTS_VALUE;
++ dst->pts = AV_NOPTS_VALUE;
++ dst->pkt_dts = AV_NOPTS_VALUE;
++ }
++ else {
++ if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
++ dst->best_effort_timestamp += t->interval / 2;
++ if (dst->pts != AV_NOPTS_VALUE)
++ dst->pts += t->interval / 2;
++ if (dst->pkt_dts != AV_NOPTS_VALUE)
++ dst->pkt_dts += t->interval / 2;
++ }
++
++ return 0;
++
++fail:
++ trk->last_n = 0;
++ trk->got_2 = 0;
++ dst->pts = AV_NOPTS_VALUE;
++ dst->pkt_dts = AV_NOPTS_VALUE;
++ return 0;
++}
++
++// We are only ever expecting in-order frames so nothing more clever is required
++static unsigned int
++pts_track_count(const pts_track_t * const trk)
++{
++ return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1);
++}
++
++static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
++{
++ const uint32_t n = pts_track_next_n(trk);
++ pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1));
++
++ pts_stats_add(&trk->stats, src->pts);
++
++ t->n = n;
++ t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last
++ av_frame_unref(t->props);
++ av_frame_copy_props(t->props, src);
++
++ // We now know what the previous interval was, rather than having to guess,
++ // so set it. There is a better than decent chance that this is before
++ // we use it.
++ if (t->interval != 0) {
++ pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1));
++ prev_t->interval = t->interval;
++ }
++
++ // In case deinterlace interpolates frames use every other usec
++ return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2};
++}
++
++static void pts_track_uninit(pts_track_t * const trk)
++{
++ unsigned int i;
++ for (i = 0; i != PTS_TRACK_SIZE; ++i) {
++ trk->a[i].n = 0;
++ av_frame_free(&trk->a[i].props);
++ }
++}
++
++static int pts_track_init(pts_track_t * const trk, void *logctx)
++{
++ unsigned int i;
++ trk->n = 1;
++ pts_stats_init(&trk->stats, logctx, "track");
++ for (i = 0; i != PTS_TRACK_SIZE; ++i) {
++ trk->a[i].n = 0;
++ if ((trk->a[i].props = av_frame_alloc()) == NULL) {
++ pts_track_uninit(trk);
++ return AVERROR(ENOMEM);
++ }
++ }
++ return 0;
++}
++
++static inline uint32_t
++fmt_bpl(const struct v4l2_format * const fmt, const unsigned int plane_n)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline;
++}
++
++static inline uint32_t
++fmt_height(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++}
++
++static inline uint32_t
++fmt_width(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++}
++
++static inline uint32_t
++fmt_pixelformat(const struct v4l2_format * const fmt)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
++}
++
++static inline uint32_t
++buf_bytesused0(const struct v4l2_buffer * const buf)
++{
++ return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? buf->m.planes[0].bytesused : buf->bytesused;
++}
++
++static void
++init_format(V4L2Queue * const q, const uint32_t format_type)
++{
++ memset(&q->format, 0, sizeof(q->format));
++ memset(&q->sel, 0, sizeof(q->sel));
++ q->format.type = format_type;
++ q->sel.type = format_type;
++}
++
++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
++{
++ struct v4l2_capability cap;
++ int ret;
++
++ memset(&cap, 0, sizeof(cap));
++ ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap);
++ if (ret < 0)
++ return ret;
++
++ if (ctx->filter_type == FILTER_V4L2_SCALE &&
++ strcmp("bcm2835-codec-isp", cap.card) != 0)
++ {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n");
++ return AVERROR(EINVAL);
++ }
++
++ if (!(cap.capabilities & V4L2_CAP_STREAMING)) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n");
++ return AVERROR(EINVAL);
++ }
++
++ if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
++ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);
++ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE);
++ }
++ else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
++ init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE);
++ init_format(&ctx->output, V4L2_BUF_TYPE_VIDEO_OUTPUT);
++ }
++ else {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n");
++ return AVERROR(EINVAL);
++ }
++
++ return 0;
++}
++
++// Just use for probe - doesn't modify q format
++static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt)
++{
++ struct v4l2_format fmt = {.type = queue->format.type};
++ DeintV4L2M2MContextShared *ctx = queue->ctx;
++ int ret, field;
++ // Pick YUV to test with if not otherwise specified
++ uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt);
++ enum AVPixelFormat r_avfmt;
++
++
++ ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt);
++ if (ret)
++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
++
++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type))
++ field = V4L2_FIELD_INTERLACED_TB;
++ else
++ field = V4L2_FIELD_NONE;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
++ fmt.fmt.pix_mp.pixelformat = pixelformat;
++ fmt.fmt.pix_mp.field = field;
++ fmt.fmt.pix_mp.width = width;
++ fmt.fmt.pix_mp.height = height;
++ } else {
++ fmt.fmt.pix.pixelformat = pixelformat;
++ fmt.fmt.pix.field = field;
++ fmt.fmt.pix.width = width;
++ fmt.fmt.pix.height = height;
++ }
++
++ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
++ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
++ fmt.fmt.pix_mp.pixelformat,
++ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
++
++ ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt);
++ if (ret)
++ return AVERROR(EINVAL);
++
++ av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
++ fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
++ fmt.fmt.pix_mp.pixelformat,
++ fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
++
++ r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt));
++ if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
++ return AVERROR(EINVAL);
++ }
++ if (r_avfmt == AV_PIX_FMT_NONE) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
++ return AVERROR(EINVAL);
++ }
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
++ if (fmt.fmt.pix_mp.field != field) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
++
++ return AVERROR(EINVAL);
++ }
++ } else {
++ if (fmt.fmt.pix.field != field) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
++
++ return AVERROR(EINVAL);
++ }
++ }
++
++ return 0;
++}
++
++static int
++do_s_fmt(V4L2Queue * const q)
++{
++ DeintV4L2M2MContextShared * const ctx = q->ctx;
++ const uint32_t pixelformat = fmt_pixelformat(&q->format);
++ int ret;
++
++ ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format);
++ if (ret) {
++ ret = AVERROR(errno);
++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret));
++ return ret;
++ }
++
++ if (pixelformat != fmt_pixelformat(&q->format)) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format)));
++ return AVERROR(EINVAL);
++ }
++
++ q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE,
++ q->sel.flags = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE;
++
++ ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel);
++ if (ret) {
++ ret = AVERROR(errno);
++ av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret));
++ }
++
++ return 0;
++}
++
++static void
++set_fmt_color(struct v4l2_format *const fmt,
++ const enum AVColorPrimaries avcp,
++ const enum AVColorSpace avcs,
++ const enum AVColorTransferCharacteristic avxc)
++{
++ enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
++ enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
++ enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
++
++ switch (avcp) {
++ case AVCOL_PRI_BT709:
++ cs = V4L2_COLORSPACE_REC709;
++ ycbcr = V4L2_YCBCR_ENC_709;
++ break;
++ case AVCOL_PRI_BT470M:
++ cs = V4L2_COLORSPACE_470_SYSTEM_M;
++ ycbcr = V4L2_YCBCR_ENC_601;
++ break;
++ case AVCOL_PRI_BT470BG:
++ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++ break;
++ case AVCOL_PRI_SMPTE170M:
++ cs = V4L2_COLORSPACE_SMPTE170M;
++ break;
++ case AVCOL_PRI_SMPTE240M:
++ cs = V4L2_COLORSPACE_SMPTE240M;
++ break;
++ case AVCOL_PRI_BT2020:
++ cs = V4L2_COLORSPACE_BT2020;
++ break;
++ case AVCOL_PRI_SMPTE428:
++ case AVCOL_PRI_SMPTE431:
++ case AVCOL_PRI_SMPTE432:
++ case AVCOL_PRI_EBU3213:
++ case AVCOL_PRI_RESERVED:
++ case AVCOL_PRI_FILM:
++ case AVCOL_PRI_UNSPECIFIED:
++ default:
++ break;
++ }
++
++ switch (avcs) {
++ case AVCOL_SPC_RGB:
++ cs = V4L2_COLORSPACE_SRGB;
++ break;
++ case AVCOL_SPC_BT709:
++ cs = V4L2_COLORSPACE_REC709;
++ break;
++ case AVCOL_SPC_FCC:
++ cs = V4L2_COLORSPACE_470_SYSTEM_M;
++ break;
++ case AVCOL_SPC_BT470BG:
++ cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++ break;
++ case AVCOL_SPC_SMPTE170M:
++ cs = V4L2_COLORSPACE_SMPTE170M;
++ break;
++ case AVCOL_SPC_SMPTE240M:
++ cs = V4L2_COLORSPACE_SMPTE240M;
++ break;
++ case AVCOL_SPC_BT2020_CL:
++ cs = V4L2_COLORSPACE_BT2020;
++ ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
++ break;
++ case AVCOL_SPC_BT2020_NCL:
++ cs = V4L2_COLORSPACE_BT2020;
++ break;
++ default:
++ break;
++ }
++
++ switch (avxc) {
++ case AVCOL_TRC_BT709:
++ xfer = V4L2_XFER_FUNC_709;
++ break;
++ case AVCOL_TRC_IEC61966_2_1:
++ xfer = V4L2_XFER_FUNC_SRGB;
++ break;
++ case AVCOL_TRC_SMPTE240M:
++ xfer = V4L2_XFER_FUNC_SMPTE240M;
++ break;
++ case AVCOL_TRC_SMPTE2084:
++ xfer = V4L2_XFER_FUNC_SMPTE2084;
++ break;
++ default:
++ break;
++ }
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ fmt->fmt.pix_mp.colorspace = cs;
++ fmt->fmt.pix_mp.ycbcr_enc = ycbcr;
++ fmt->fmt.pix_mp.xfer_func = xfer;
++ } else {
++ fmt->fmt.pix.colorspace = cs;
++ fmt->fmt.pix.ycbcr_enc = ycbcr;
++ fmt->fmt.pix.xfer_func = xfer;
++ }
++}
++
++static void
++set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr)
++{
++ const enum v4l2_quantization q =
++ avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
++ avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
++ V4L2_QUANTIZATION_DEFAULT;
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ fmt->fmt.pix_mp.quantization = q;
++ } else {
++ fmt->fmt.pix.quantization = q;
++ }
++}
++
++static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt)
++{
++ enum v4l2_ycbcr_encoding ycbcr;
++ enum v4l2_colorspace cs;
++
++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.colorspace :
++ fmt->fmt.pix.colorspace;
++
++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.ycbcr_enc:
++ fmt->fmt.pix.ycbcr_enc;
++
++ switch(ycbcr) {
++ case V4L2_YCBCR_ENC_XV709:
++ case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709;
++ case V4L2_YCBCR_ENC_XV601:
++ case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M;
++ default:
++ break;
++ }
++
++ switch(cs) {
++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG;
++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M;
++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M;
++ case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020;
++ default:
++ break;
++ }
++
++ return AVCOL_PRI_UNSPECIFIED;
++}
++
++static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt)
++{
++ enum v4l2_ycbcr_encoding ycbcr;
++ enum v4l2_colorspace cs;
++
++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.colorspace :
++ fmt->fmt.pix.colorspace;
++
++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.ycbcr_enc:
++ fmt->fmt.pix.ycbcr_enc;
++
++ switch(cs) {
++ case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB;
++ case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709;
++ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC;
++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG;
++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M;
++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M;
++ case V4L2_COLORSPACE_BT2020:
++ if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM)
++ return AVCOL_SPC_BT2020_CL;
++ else
++ return AVCOL_SPC_BT2020_NCL;
++ default:
++ break;
++ }
++
++ return AVCOL_SPC_UNSPECIFIED;
++}
++
++static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt)
++{
++ enum v4l2_ycbcr_encoding ycbcr;
++ enum v4l2_xfer_func xfer;
++ enum v4l2_colorspace cs;
++
++ cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.colorspace :
++ fmt->fmt.pix.colorspace;
++
++ ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.ycbcr_enc:
++ fmt->fmt.pix.ycbcr_enc;
++
++ xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.xfer_func:
++ fmt->fmt.pix.xfer_func;
++
++ switch (xfer) {
++ case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709;
++ case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1;
++ default:
++ break;
++ }
++
++ switch (cs) {
++ case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22;
++ case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28;
++ case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M;
++ case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M;
++ default:
++ break;
++ }
++
++ switch (ycbcr) {
++ case V4L2_YCBCR_ENC_XV709:
++ case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG;
++ default:
++ break;
++ }
++
++ return AVCOL_TRC_UNSPECIFIED;
++}
++
++static enum AVColorRange get_color_range(const struct v4l2_format *const fmt)
++{
++ enum v4l2_quantization qt;
++
++ qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
++ fmt->fmt.pix_mp.quantization :
++ fmt->fmt.pix.quantization;
++
++ switch (qt) {
++ case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG;
++ case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG;
++ default:
++ break;
++ }
++
++ return AVCOL_RANGE_UNSPECIFIED;
++}
++
++static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
++{
++ struct v4l2_format *const format = &q->format;
++ const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++ const uint32_t drm_fmt = src->layers[0].format;
++ // Treat INVALID as LINEAR
++ const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
++ DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
++ uint32_t pix_fmt = 0;
++ uint32_t w = 0;
++ uint32_t h = 0;
++ uint32_t bpl = src->layers[0].planes[0].pitch;
++
++ // We really don't expect multiple layers
++ // All formats that we currently cope with are single object
++
++ if (src->nb_layers != 1 || src->nb_objects != 1)
++ return AVERROR(EINVAL);
++
++ switch (drm_fmt) {
++ case DRM_FORMAT_YUV420:
++ if (mod == DRM_FORMAT_MOD_LINEAR) {
++ if (src->layers[0].nb_planes != 3)
++ break;
++ pix_fmt = V4L2_PIX_FMT_YUV420;
++ h = src->layers[0].planes[1].offset / bpl;
++ w = bpl;
++ }
++ break;
++
++ case DRM_FORMAT_NV12:
++ if (mod == DRM_FORMAT_MOD_LINEAR) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12;
++ h = src->layers[0].planes[1].offset / bpl;
++ w = bpl;
++ }
++#if CONFIG_SAND
++ else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12_COL128;
++ w = bpl;
++ h = src->layers[0].planes[1].offset / 128;
++ bpl = fourcc_mod_broadcom_param(mod);
++ }
++#endif
++ break;
++
++ case DRM_FORMAT_P030:
++#if CONFIG_SAND
++ if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++ if (src->layers[0].nb_planes != 2)
++ break;
++ pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
++ w = bpl / 2; // Matching lie to how we construct this
++ h = src->layers[0].planes[1].offset / 128;
++ bpl = fourcc_mod_broadcom_param(mod);
++ }
++#endif
++ break;
++
++ default:
++ break;
++ }
++
++ if (!pix_fmt)
++ return AVERROR(EINVAL);
++
++ if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++ struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
++
++ pix->width = w;
++ pix->height = h;
++ pix->pixelformat = pix_fmt;
++ pix->plane_fmt[0].bytesperline = bpl;
++ pix->num_planes = 1;
++ }
++ else {
++ struct v4l2_pix_format *const pix = &format->fmt.pix;
++
++ pix->width = w;
++ pix->height = h;
++ pix->pixelformat = pix_fmt;
++ pix->bytesperline = bpl;
++ }
++
++ set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc);
++ set_fmt_color_range(format, frame->color_range);
++
++ q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right);
++ q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom);
++ q->sel.r.left = frame->crop_left;
++ q->sel.r.top = frame->crop_top;
++
++ return 0;
++}
++
++
++static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height)
++{
++ struct v4l2_format * const fmt = &queue->format;
++ struct v4l2_selection *const sel = &queue->sel;
++
++ memset(&fmt->fmt, 0, sizeof(fmt->fmt));
++
++ // Align w/h to 16 here in case there are alignment requirements at the next
++ // stage of the filter chain (also RPi deinterlace setup is bust and this
++ // fixes it)
++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++ fmt->fmt.pix_mp.pixelformat = pixelformat;
++ fmt->fmt.pix_mp.field = field;
++ fmt->fmt.pix_mp.width = FFALIGN(width, 16);
++ fmt->fmt.pix_mp.height = FFALIGN(height, 16);
++ } else {
++ fmt->fmt.pix.pixelformat = pixelformat;
++ fmt->fmt.pix.field = field;
++ fmt->fmt.pix.width = FFALIGN(width, 16);
++ fmt->fmt.pix.height = FFALIGN(height, 16);
++ }
++
++ set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer);
++ set_fmt_color_range(fmt, priv->colour_range);
++
++ sel->r.width = width;
++ sel->r.height = height;
++ sel->r.left = 0;
++ sel->r.top = 0;
++
++ return do_s_fmt(queue);
++}
++
++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
++{
++ int ret;
++
++ ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
++ if (ctx->fd < 0)
++ return AVERROR(errno);
++
++ ret = deint_v4l2m2m_prepare_context(ctx);
++ if (ret) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n");
++ goto fail;
++ }
++
++ ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, ctx->output_height, ctx->output_format);
++ if (ret) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n");
++ goto fail;
++ }
++
++ ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE);
++ if (ret) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n");
++ goto fail;
++ }
++
++ return 0;
++
++fail:
++ close(ctx->fd);
++ ctx->fd = -1;
++
++ return ret;
++}
++
++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
++{
++ int ret = AVERROR(EINVAL);
++ struct dirent *entry;
++ char node[PATH_MAX];
++ DIR *dirp;
++
++ dirp = opendir("/dev");
++ if (!dirp)
++ return AVERROR(errno);
++
++ for (entry = readdir(dirp); entry; entry = readdir(dirp)) {
++
++ if (strncmp(entry->d_name, "video", 5))
++ continue;
++
++ snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
++ av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
++ ret = deint_v4l2m2m_probe_device(ctx, node);
++ if (!ret)
++ break;
++ }
++
++ closedir(dirp);
++
++ if (ret) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
++ ctx->fd = -1;
++
++ return ret;
++ }
++
++ av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
++
++ return 0;
++}
++
++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
++{
++ int ret;
++
++ ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer);
++ if (ret < 0)
++ return AVERROR(errno);
++
++ buf->enqueued = 1;
++
++ return 0;
++}
++
++static void
++drm_frame_init(AVDRMFrameDescriptor * const d)
++{
++ unsigned int i;
++ for (i = 0; i != AV_DRM_MAX_PLANES; ++i) {
++ d->objects[i].fd = -1;
++ }
++}
++
++static void
++drm_frame_uninit(AVDRMFrameDescriptor * const d)
++{
++ unsigned int i;
++ for (i = 0; i != d->nb_objects; ++i) {
++ if (d->objects[i].fd != -1) {
++ close(d->objects[i].fd);
++ d->objects[i].fd = -1;
++ }
++ }
++}
++
++static void
++avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n)
++{
++ unsigned int i;
++ V4L2Buffer* const avbufs = *ppavbufs;
++
++ if (avbufs == NULL)
++ return;
++ *ppavbufs = NULL;
++
++ for (i = 0; i != n; ++i) {
++ V4L2Buffer* const avbuf = avbufs + i;
++ drm_frame_uninit(&avbuf->drm_frame);
++ }
++
++ av_free(avbufs);
++}
++
++static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
++{
++ struct v4l2_exportbuffer expbuf;
++ int i, ret;
++ uint64_t mod = DRM_FORMAT_MOD_LINEAR;
++
++ AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame;
++ AVDRMLayerDescriptor * const layer = &drm_desc->layers[0];
++ const struct v4l2_format *const fmt = &q->format;
++ const uint32_t height = fmt_height(fmt);
++ ptrdiff_t bpl0;
++
++ /* fill the DRM frame descriptor */
++ drm_desc->nb_layers = 1;
++ layer->nb_planes = avbuf->num_planes;
++
++ for (int i = 0; i < avbuf->num_planes; i++) {
++ layer->planes[i].object_index = i;
++ layer->planes[i].offset = 0;
++ layer->planes[i].pitch = fmt_bpl(fmt, i);
++ }
++ bpl0 = layer->planes[0].pitch;
++
++ switch (fmt_pixelformat(fmt)) {
++#if CONFIG_SAND
++ case V4L2_PIX_FMT_NV12_COL128:
++ mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0);
++ layer->format = DRM_FORMAT_NV12; // DRM fourcc (same value as V4L2_PIX_FMT_NV12)
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 2;
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = height * 128;
++ layer->planes[0].pitch = fmt_width(fmt);
++ layer->planes[1].pitch = layer->planes[0].pitch;
++ break;
++#endif
++
++ case V4L2_PIX_FMT_NV12: // switch is on a V4L2 pixelformat (same fourcc as DRM_FORMAT_NV12)
++ layer->format = DRM_FORMAT_NV12;
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 2;
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = bpl0 * height;
++ layer->planes[1].pitch = bpl0;
++ break;
++
++ case V4L2_PIX_FMT_YUV420:
++ layer->format = DRM_FORMAT_YUV420;
++
++ if (avbuf->num_planes > 1)
++ break;
++
++ layer->nb_planes = 3;
++ layer->planes[1].object_index = 0;
++ layer->planes[1].offset = bpl0 * height;
++ layer->planes[1].pitch = bpl0 / 2;
++ layer->planes[2].object_index = 0;
++ layer->planes[2].offset = layer->planes[1].offset + ((bpl0 * height) / 4);
++ layer->planes[2].pitch = bpl0 / 2;
++ break;
++
++ default:
++ drm_desc->nb_layers = 0;
++ return AVERROR(EINVAL);
++ }
++
++ drm_desc->nb_objects = 0;
++ for (i = 0; i < avbuf->num_planes; i++) {
++ memset(&expbuf, 0, sizeof(expbuf));
++
++ expbuf.index = avbuf->buffer.index;
++ expbuf.type = avbuf->buffer.type;
++ expbuf.plane = i;
++
++ ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf);
++ if (ret < 0)
++ return AVERROR(errno);
++
++ drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ?
++ avbuf->buffer.m.planes[i].length : avbuf->buffer.length;
++ drm_desc->objects[i].fd = expbuf.fd;
++ drm_desc->objects[i].format_modifier = mod;
++ drm_desc->nb_objects = i + 1;
++ }
++
++ return 0;
++}
++
++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
++{
++ struct v4l2_format *fmt = &queue->format;
++ DeintV4L2M2MContextShared *ctx = queue->ctx;
++ struct v4l2_requestbuffers req;
++ int ret, i, multiplanar;
++ uint32_t memory;
++
++ memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
++ V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
++
++ multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
++
++ memset(&req, 0, sizeof(req));
++ req.count = queue->num_buffers;
++ req.memory = memory;
++ req.type = fmt->type;
++
++ ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
++ if (ret < 0) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno));
++
++ return AVERROR(errno);
++ }
++
++ queue->num_buffers = req.count;
++ queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer));
++ if (!queue->buffers) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
++
++ return AVERROR(ENOMEM);
++ }
++
++ for (i = 0; i < queue->num_buffers; i++) {
++ V4L2Buffer * const buf = &queue->buffers[i];
++
++ buf->enqueued = 0;
++ buf->q = queue;
++
++ buf->buffer.type = fmt->type;
++ buf->buffer.memory = memory;
++ buf->buffer.index = i;
++
++ if (multiplanar) {
++ buf->buffer.length = VIDEO_MAX_PLANES;
++ buf->buffer.m.planes = buf->planes;
++ }
++
++ drm_frame_init(&buf->drm_frame);
++ }
++
++ for (i = 0; i < queue->num_buffers; i++) {
++ V4L2Buffer * const buf = &queue->buffers[i];
++
++ ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
++ if (ret < 0) {
++ ret = AVERROR(errno);
++
++ goto fail;
++ }
++
++ buf->num_planes = multiplanar ? buf->buffer.length : 1;
++
++ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
++ ret = deint_v4l2m2m_enqueue_buffer(buf);
++ if (ret)
++ goto fail;
++
++ ret = v4l2_buffer_export_drm(queue, buf);
++ if (ret)
++ goto fail;
++ }
++ }
++
++ return 0;
++
++fail:
++ avbufs_delete(&queue->buffers, queue->num_buffers);
++ queue->num_buffers = 0;
++ return ret;
++}
++
++static int deint_v4l2m2m_streamon(V4L2Queue *queue)
++{
++ DeintV4L2M2MContextShared * const ctx = queue->ctx;
++ int type = queue->format.type;
++ int ret;
++
++ ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
++ if (ret < 0)
++ return AVERROR(errno);
++
++ return 0;
++}
++
++static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
++{
++ DeintV4L2M2MContextShared * const ctx = queue->ctx;
++ int type = queue->format.type;
++ int ret;
++
++ ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
++ if (ret < 0)
++ return AVERROR(errno);
++
++ return 0;
++}
++
++// timeout in ms
++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
++{
++ struct v4l2_plane planes[VIDEO_MAX_PLANES];
++ DeintV4L2M2MContextShared *ctx = queue->ctx;
++ struct v4l2_buffer buf = { 0 };
++ V4L2Buffer* avbuf = NULL;
++ struct pollfd pfd;
++ short events;
++ int ret;
++
++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
++ events = POLLOUT | POLLWRNORM;
++ else
++ events = POLLIN | POLLRDNORM;
++
++ pfd.events = events;
++ pfd.fd = ctx->fd;
++
++ for (;;) {
++ ret = poll(&pfd, 1, timeout);
++ if (ret > 0)
++ break;
++ if (errno == EINTR)
++ continue;
++ return NULL;
++ }
++
++ if (pfd.revents & POLLERR)
++ return NULL;
++
++ if (pfd.revents & events) {
++ memset(&buf, 0, sizeof(buf));
++ buf.memory = V4L2_MEMORY_MMAP;
++ buf.type = queue->format.type;
++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
++ memset(planes, 0, sizeof(planes));
++ buf.length = VIDEO_MAX_PLANES;
++ buf.m.planes = planes;
++ }
++
++ ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
++ if (ret) {
++ if (errno != EAGAIN)
++ av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
++ av_err2str(AVERROR(errno)));
++ return NULL;
++ }
++
++ avbuf = &queue->buffers[buf.index];
++ avbuf->enqueued = 0;
++ avbuf->buffer = buf;
++ if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
++ memcpy(avbuf->planes, planes, sizeof(planes));
++ avbuf->buffer.m.planes = avbuf->planes;
++ }
++ return avbuf;
++ }
++
++ return NULL;
++}
++
++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
++{
++ int i;
++ V4L2Buffer *buf = NULL;
++
++ for (i = 0; i < queue->num_buffers; i++)
++ if (!queue->buffers[i].enqueued) {
++ buf = &queue->buffers[i];
++ break;
++ }
++ return buf;
++}
++
++static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
++{
++ int i;
++ V4L2Buffer *buf = NULL;
++
++ if (!queue || !queue->buffers)
++ return;
++ for (i = 0; i < queue->num_buffers; i++) {
++ buf = &queue->buffers[i];
++ if (queue->buffers[i].enqueued)
++ av_frame_unref(&buf->frame);
++ }
++}
++
++static void recycle_q(V4L2Queue * const queue)
++{
++ V4L2Buffer* avbuf;
++ while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) {
++ av_frame_unref(&avbuf->frame);
++ }
++}
++
++static int count_enqueued(V4L2Queue *queue)
++{
++ int i;
++ int n = 0;
++
++ if (queue->buffers == NULL)
++ return 0;
++
++ for (i = 0; i < queue->num_buffers; i++)
++ if (queue->buffers[i].enqueued)
++ ++n;
++ return n;
++}
++
++static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
++{
++ DeintV4L2M2MContextShared *const ctx = queue->ctx;
++ AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
++ V4L2Buffer *buf;
++ int i;
++
++ if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
++ recycle_q(queue);
++
++ buf = deint_v4l2m2m_find_free_buf(queue);
++ if (!buf) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0);
++ return AVERROR(EAGAIN);
++ }
++ if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
++ for (i = 0; i < drm_desc->nb_objects; i++)
++ buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
++ else
++ buf->buffer.m.fd = drm_desc->objects[0].fd;
++
++ buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
++ frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
++ V4L2_FIELD_INTERLACED_BT;
++
++ if (ctx->field_order != buf->buffer.field) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
++ ctx->field_order = buf->buffer.field;
++ }
++
++ buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
++
++ buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
++
++ av_frame_move_ref(&buf->frame, frame);
++
++ return deint_v4l2m2m_enqueue_buffer(buf);
++}
++
++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
++{
++ if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
++ V4L2Queue *capture = &ctx->capture;
++ V4L2Queue *output = &ctx->output;
++
++ av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
++
++ if (ctx->fd >= 0) {
++ deint_v4l2m2m_streamoff(capture);
++ deint_v4l2m2m_streamoff(output);
++ }
++
++ avbufs_delete(&capture->buffers, capture->num_buffers);
++
++ deint_v4l2m2m_unref_queued(output);
++
++ av_buffer_unref(&ctx->hw_frames_ctx);
++
++ if (capture->buffers)
++ av_free(capture->buffers);
++
++ if (output->buffers)
++ av_free(output->buffers);
++
++ if (ctx->fd >= 0) {
++ close(ctx->fd);
++ ctx->fd = -1;
++ }
++
++ av_free(ctx);
++ }
++}
++
++static void v4l2_free_buffer(void *opaque, uint8_t *unused)
++{
++ V4L2Buffer *buf = opaque;
++ DeintV4L2M2MContextShared *ctx = buf->q->ctx;
++
++ if (!ctx->done)
++ deint_v4l2m2m_enqueue_buffer(buf);
++
++ deint_v4l2m2m_destroy_context(ctx);
++}
++
++// timeout in ms
++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
++{
++ DeintV4L2M2MContextShared *ctx = queue->ctx;
++ V4L2Buffer* avbuf;
++ enum AVColorPrimaries color_primaries;
++ enum AVColorSpace colorspace;
++ enum AVColorTransferCharacteristic color_trc;
++ enum AVColorRange color_range;
++
++ av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++ if (queue->eos) {
++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__);
++ return AVERROR_EOF;
++ }
++
++ avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
++ if (!avbuf) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
++ return AVERROR(EAGAIN);
++ }
++
++ if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) {
++ if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0)
++ queue->eos = 1;
++ if (buf_bytesused0(&avbuf->buffer) == 0)
++ return queue->eos ? AVERROR_EOF : AVERROR(EINVAL);
++ }
++
++ // Fill in PTS and anciliary info from src frame
++ pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
++
++ frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
++ sizeof(avbuf->drm_frame), v4l2_free_buffer,
++ avbuf, AV_BUFFER_FLAG_READONLY);
++ if (!frame->buf[0]) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0);
++ return AVERROR(ENOMEM);
++ }
++
++ atomic_fetch_add(&ctx->refcount, 1);
++
++ frame->data[0] = (uint8_t *)&avbuf->drm_frame;
++ frame->format = AV_PIX_FMT_DRM_PRIME;
++ if (ctx->hw_frames_ctx)
++ frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
++ frame->height = ctx->output_height;
++ frame->width = ctx->output_width;
++
++ color_primaries = get_color_primaries(&ctx->capture.format);
++ colorspace = get_color_space(&ctx->capture.format);
++ color_trc = get_color_trc(&ctx->capture.format);
++ color_range = get_color_range(&ctx->capture.format);
++
++ // If the color parameters are unspecified by V4L2 then leave alone as they
++ // will have been copied from src
++ if (color_primaries != AVCOL_PRI_UNSPECIFIED)
++ frame->color_primaries = color_primaries;
++ if (colorspace != AVCOL_SPC_UNSPECIFIED)
++ frame->colorspace = colorspace;
++ if (color_trc != AVCOL_TRC_UNSPECIFIED)
++ frame->color_trc = color_trc;
++ if (color_range != AVCOL_RANGE_UNSPECIFIED)
++ frame->color_range = color_range;
++
++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) {
++ // Not interlaced now
++ frame->interlaced_frame = 0; // *** Fill in from dst buffer?
++ frame->top_field_first = 0;
++ // Pkt duration halved
++ frame->pkt_duration /= 2;
++ }
++
++ if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
++ av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
++ frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
++ }
++
++ av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
++ return 0;
++}
++
++static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
++{
++ AVFilterLink *inlink = outlink->src->inputs[0];
++ AVFilterContext *avctx = outlink->src;
++ DeintV4L2M2MContext *priv = avctx->priv;
++ DeintV4L2M2MContextShared *ctx = priv->shared;
++ int ret;
++
++ ctx->height = avctx->inputs[0]->h;
++ ctx->width = avctx->inputs[0]->w;
++
++ if (ctx->filter_type == FILTER_V4L2_SCALE) {
++ if ((ret = ff_scale_eval_dimensions(priv,
++ priv->w_expr, priv->h_expr,
++ inlink, outlink,
++ &ctx->output_width, &ctx->output_height)) < 0)
++ return ret;
++
++ ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height,
++ priv->force_original_aspect_ratio, priv->force_divisible_by);
++ }
++ else {
++ ctx->output_width = ctx->width;
++ ctx->output_height = ctx->height;
++ }
++
++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__,
++ ctx->width, ctx->height, ctx->output_width, ctx->output_height,
++ inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den);
++
++ outlink->time_base = inlink->time_base;
++ outlink->w = ctx->output_width;
++ outlink->h = ctx->output_height;
++ outlink->format = inlink->format;
++ if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0)
++ outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den};
++
++ if (inlink->sample_aspect_ratio.num)
++ outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
++ else
++ outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
++
++ ret = deint_v4l2m2m_find_device(ctx);
++ if (ret)
++ return ret;
++
++ if (inlink->hw_frames_ctx) {
++ ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
++ if (!ctx->hw_frames_ctx)
++ return AVERROR(ENOMEM);
++ }
++ return 0;
++}
++
++static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
++{
++ const uint64_t mod = drm_desc->objects[0].format_modifier;
++ const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID);
++
++ // Only currently support single object things
++ if (drm_desc->nb_objects != 1)
++ return 0;
++
++ switch (drm_desc->layers[0].format) {
++ case DRM_FORMAT_YUV420:
++ return is_linear ? V4L2_PIX_FMT_YUV420 : 0;
++ case DRM_FORMAT_NV12:
++ return is_linear ? V4L2_PIX_FMT_NV12 :
++#if CONFIG_SAND
++ fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 :
++#endif
++ 0;
++ default:
++ break;
++ }
++ return 0;
++}
++
++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
++{
++ AVFilterContext *avctx = link->dst;
++ DeintV4L2M2MContext *priv = avctx->priv;
++ DeintV4L2M2MContextShared *ctx = priv->shared;
++ V4L2Queue *capture = &ctx->capture;
++ V4L2Queue *output = &ctx->output;
++ int ret;
++
++ av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n",
++ __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
++ av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
++ avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
++
++ if (ctx->field_order == V4L2_FIELD_ANY) {
++ const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
++ uint32_t pixelformat = desc_pixelformat(drm_desc);
++
++ if (pixelformat == 0) {
++ av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
++ av_fourcc2str(drm_desc->layers[0].format),
++ drm_desc->nb_objects, drm_desc->objects[0].format_modifier);
++ return AVERROR(EINVAL);
++ }
++
++ ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
++ ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
++
++ av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
++ drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
++
++ if ((ret = set_src_fmt(output, in)) != 0) {
++ av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n",
++ av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier);
++ return ret;
++ }
++
++ ret = do_s_fmt(output);
++ if (ret) {
++ av_log(avctx, AV_LOG_WARNING, "Failed to set source format\n");
++ return ret;
++ }
++
++ if (ctx->output_format != AV_PIX_FMT_NONE)
++ pixelformat = fmt_av_to_v4l2(ctx->output_format);
++ ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height);
++ if (ret) {
++ av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n");
++ return ret;
++ }
++
++ ret = deint_v4l2m2m_allocate_buffers(capture);
++ if (ret) {
++ av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n");
++ return ret;
++ }
++
++ ret = deint_v4l2m2m_streamon(capture);
++ if (ret) {
++ av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret));
++ return ret;
++ }
++
++ ret = deint_v4l2m2m_allocate_buffers(output);
++ if (ret) {
++ av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n");
++ return ret;
++ }
++
++ ret = deint_v4l2m2m_streamon(output);
++ if (ret) {
++ av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret));
++ return ret;
++ }
++
++ if (in->top_field_first)
++ ctx->field_order = V4L2_FIELD_INTERLACED_TB;
++ else
++ ctx->field_order = V4L2_FIELD_INTERLACED_BT;
++
++ {
++ struct v4l2_encoder_cmd ecmd = {
++ .cmd = V4L2_ENC_CMD_STOP
++ };
++ ctx->has_enc_stop = 0;
++ if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n");
++ ctx->has_enc_stop = 1;
++ }
++ else {
++ av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno)));
++ }
++
++ }
++ }
++
++ ret = deint_v4l2m2m_enqueue_frame(output, in);
++
++ av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
++ return ret;
++}
++
++static int
++ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s,
++ AVFilterLink * const inlink)
++{
++ int instatus;
++ int64_t inpts;
++
++ if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0)
++ return 0;
++
++ s->drain = instatus;
++ s->drain_pts = inpts;
++ s->drain_state = DRAIN_TIMEOUT;
++
++ if (s->field_order == V4L2_FIELD_ANY) { // Not yet started
++ s->drain_state = DRAIN_DONE;
++ }
++ else if (s->one_to_one) {
++ s->drain_state = DRAIN_LAST;
++ }
++ else if (s->has_enc_stop) {
++ struct v4l2_encoder_cmd ecmd = {
++ .cmd = V4L2_ENC_CMD_STOP
++ };
++ if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) {
++ av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n");
++ s->drain_state = DRAIN_EOS;
++ }
++ else {
++ av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno)));
++ }
++ }
++ return 1;
++}
++
++static int deint_v4l2m2m_activate(AVFilterContext *avctx)
++{
++ DeintV4L2M2MContext * const priv = avctx->priv;
++ DeintV4L2M2MContextShared *const s = priv->shared;
++ AVFilterLink * const outlink = avctx->outputs[0];
++ AVFilterLink * const inlink = avctx->inputs[0];
++ int n = 0;
++ int cn = 99;
++ int did_something = 0;
++
++ av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++ FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
++
++ ack_inlink(avctx, s, inlink);
++
++ if (s->field_order != V4L2_FIELD_ANY) // Can't DQ if no setup!
++ {
++ AVFrame * frame = av_frame_alloc();
++ int rv;
++
++ recycle_q(&s->output);
++ n = count_enqueued(&s->output);
++
++ if (frame == NULL) {
++ av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
++ return AVERROR(ENOMEM);
++ }
++
++ rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame,
++ drain_frame_expected(s->drain_state) || n > 4 ? 300 : 0);
++ if (rv != 0) {
++ av_frame_free(&frame);
++ if (rv == AVERROR_EOF) {
++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__);
++ s->drain_state = DRAIN_DONE;
++ }
++ else if (rv == AVERROR(EAGAIN)) {
++ if (s->drain_state != DRAIN_NONE) {
++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__);
++ s->drain_state = DRAIN_DONE;
++ }
++ }
++ else {
++ av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
++ return rv;
++ }
++ }
++ else {
++ frame->interlaced_frame = 0;
++ // frame is always consumed by filter_frame - even on error despite
++ // a somewhat confusing comment in the header
++ rv = ff_filter_frame(outlink, frame);
++ ++s->frames_tx;
++
++ av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
++ did_something = 1;
++
++ if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) {
++ av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__);
++ s->drain_state = DRAIN_DONE;
++ }
++ }
++
++ cn = count_enqueued(&s->capture);
++ }
++
++ if (s->drain_state == DRAIN_DONE) {
++ ff_outlink_set_status(outlink, s->drain, s->drain_pts);
++ av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain));
++ return 0;
++ }
++
++ recycle_q(&s->output);
++ n = count_enqueued(&s->output);
++
++ while (n < 6 && !s->drain) {
++ AVFrame * frame;
++ int rv;
++
++ if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
++ av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
++ return rv;
++ }
++
++ if (frame == NULL) {
++ av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
++ if (!ack_inlink(avctx, s, inlink)) {
++ ff_inlink_request_frame(inlink);
++ av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
++ }
++ break;
++ }
++ ++s->frames_rx;
++
++ rv = deint_v4l2m2m_filter_frame(inlink, frame);
++ av_frame_free(&frame);
++
++ if (rv != 0)
++ return rv;
++
++ av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
++ did_something = 1;
++ ++n;
++ }
++
++ if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) {
++ ff_filter_set_ready(avctx, 1);
++ did_something = 1;
++ av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
++ }
++
++ av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
++ return did_something ? 0 : FFERROR_NOT_READY;
++}
++
++static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type)
++{
++ DeintV4L2M2MContext * const priv = avctx->priv;
++ DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
++
++ if (!ctx) {
++ av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0);
++ return AVERROR(ENOMEM);
++ }
++ priv->shared = ctx;
++ ctx->logctx = priv;
++ ctx->filter_type = filter_type;
++ ctx->fd = -1;
++ ctx->output.ctx = ctx;
++ ctx->output.num_buffers = 8;
++ ctx->output.name = "OUTPUT";
++ ctx->capture.ctx = ctx;
++ ctx->capture.num_buffers = 12;
++ ctx->capture.name = "CAPTURE";
++ ctx->done = 0;
++ ctx->field_order = V4L2_FIELD_ANY;
++
++ pts_track_init(&ctx->track, priv);
++
++ atomic_init(&ctx->refcount, 1);
++
++ if (priv->output_format_string) {
++ ctx->output_format = av_get_pix_fmt(priv->output_format_string);
++ if (ctx->output_format == AV_PIX_FMT_NONE) {
++ av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string);
++ return AVERROR(EINVAL);
++ }
++ if (fmt_av_to_v4l2(ctx->output_format) == 0) {
++ av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format));
++ return AVERROR(EINVAL);
++ }
++ } else {
++ // Use the input format once that is configured.
++ ctx->output_format = AV_PIX_FMT_NONE;
++ }
++
++#define STRING_OPTION(var_name, func_name, default_value) do { \
++ if (priv->var_name ## _string) { \
++ int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \
++ if (var < 0) { \
++ av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \
++ return AVERROR(EINVAL); \
++ } \
++ priv->var_name = var; \
++ } else { \
++ priv->var_name = default_value; \
++ } \
++ } while (0)
++
++ STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED);
++ STRING_OPTION(colour_transfer, color_transfer, AVCOL_TRC_UNSPECIFIED);
++ STRING_OPTION(colour_matrix, color_space, AVCOL_SPC_UNSPECIFIED);
++ STRING_OPTION(chroma_location, chroma_location, AVCHROMA_LOC_UNSPECIFIED);
++
++ return 0;
++}
++
++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
++{
++ return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE);
++}
++
++static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx)
++{
++ int rv;
++ DeintV4L2M2MContext * priv;
++ DeintV4L2M2MContextShared * ctx;
++
++ if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0)
++ return rv;
++
++ priv = avctx->priv;
++ ctx = priv->shared;
++
++ ctx->one_to_one = 1;
++ return 0;
++}
++
++static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
++{
++ DeintV4L2M2MContext *priv = avctx->priv;
++ DeintV4L2M2MContextShared *ctx = priv->shared;
++
++ av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n",
++ ctx->frames_rx, ctx->frames_tx);
++ ctx->done = 1;
++ ctx->logctx = NULL; // Log to NULL works, log to missing crashes
++ pts_track_uninit(&ctx->track);
++ deint_v4l2m2m_destroy_context(ctx);
++}
++
++static const AVOption deinterlace_v4l2m2m_options[] = {
++ { NULL },
++};
++
++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
++
++#define OFFSET(x) offsetof(DeintV4L2M2MContext, x)
++#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
++
++static const AVOption scale_v4l2m2m_options[] = {
++ { "w", "Output video width",
++ OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS },
++ { "h", "Output video height",
++ OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS },
++ { "format", "Output video format (software format of hardware frames)",
++ OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS },
++ // These colour properties match the ones of the same name in vf_scale.
++ { "out_color_matrix", "Output colour matrix coefficient set",
++ OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS },
++ { "out_range", "Output colour range",
++ OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED },
++ AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" },
++ { "full", "Full range",
++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
++ { "limited", "Limited range",
++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
++ { "jpeg", "Full range",
++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
++ { "mpeg", "Limited range",
++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
++ { "tv", "Limited range",
++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
++ { "pc", "Full range",
++ 0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
++ // These colour properties match the ones in the VAAPI scaler
++ { "out_color_primaries", "Output colour primaries",
++ OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING,
++ { .str = NULL }, .flags = FLAGS },
++ { "out_color_transfer", "Output colour transfer characteristics",
++ OFFSET(colour_transfer_string), AV_OPT_TYPE_STRING,
++ { .str = NULL }, .flags = FLAGS },
++ { "out_chroma_location", "Output chroma sample location",
++ OFFSET(chroma_location_string), AV_OPT_TYPE_STRING,
++ { .str = NULL }, .flags = FLAGS },
++ { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" },
++ { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS },
++ { NULL },
++};
++
++AVFILTER_DEFINE_CLASS(scale_v4l2m2m);
++
++static const AVFilterPad deint_v4l2m2m_inputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ },
++};
++
++static const AVFilterPad deint_v4l2m2m_outputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .config_props = deint_v4l2m2m_config_props,
++ },
++};
++
++AVFilter ff_vf_deinterlace_v4l2m2m = {
++ .name = "deinterlace_v4l2m2m",
++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
++ .priv_size = sizeof(DeintV4L2M2MContext),
++ .init = &deint_v4l2m2m_init,
++ .uninit = &deint_v4l2m2m_uninit,
++ FILTER_INPUTS(deint_v4l2m2m_inputs),
++ FILTER_OUTPUTS(deint_v4l2m2m_outputs),
++ FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME),
++ .priv_class = &deinterlace_v4l2m2m_class,
++ .activate = deint_v4l2m2m_activate,
++};
++
++AVFilter ff_vf_scale_v4l2m2m = {
++ .name = "scale_v4l2m2m",
++ .description = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"),
++ .priv_size = sizeof(DeintV4L2M2MContext),
++ .init = &scale_v4l2m2m_init,
++ .uninit = &deint_v4l2m2m_uninit,
++ FILTER_INPUTS(deint_v4l2m2m_inputs),
++ FILTER_OUTPUTS(deint_v4l2m2m_outputs),
++ FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME),
++ .priv_class = &scale_v4l2m2m_class,
++ .activate = deint_v4l2m2m_activate,
++};
++
+diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c
+new file mode 100644
+index 0000000000..7100f2fc9b
+--- /dev/null
++++ b/libavfilter/vf_unsand.c
+@@ -0,0 +1,228 @@
++/*
++ * Copyright (c) 2007 Bobby Bingham
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * format and noformat video filters
++ */
++
++#include <string.h>
++
++#include "libavutil/internal.h"
++#include "libavutil/mem.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/opt.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "video.h"
++
++typedef struct UnsandContext {
++ const AVClass *class;
++} UnsandContext;
++
++static av_cold void uninit(AVFilterContext *ctx)
++{
++// UnsandContext *s = ctx->priv;
++}
++
++static av_cold int init(AVFilterContext *ctx)
++{
++// UnsandContext *s = ctx->priv;
++
++ return 0;
++}
++
++
++static int filter_frame(AVFilterLink *link, AVFrame *in)
++{
++ AVFilterLink * const outlink = link->dst->outputs[0];
++ AVFrame *out = NULL;
++ int rv = 0;
++
++ if (outlink->format == in->format) {
++ // If nothing to do then do nothing
++ out = in;
++ }
++ else
++ {
++ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
++ {
++ rv = AVERROR(ENOMEM);
++ goto fail;
++ }
++ if (av_rpi_sand_to_planar_frame(out, in) != 0)
++ {
++ rv = -1;
++ goto fail;
++ }
++
++ av_frame_free(&in);
++ }
++
++ return ff_filter_frame(outlink, out);
++
++fail:
++ av_frame_free(&out);
++ av_frame_free(&in);
++ return rv;
++}
++
++#if 0
++static void dump_fmts(const AVFilterFormats * fmts)
++{
++ int i;
++ if (fmts== NULL) {
++ printf("NULL\n");
++ return;
++ }
++ for (i = 0; i < fmts->nb_formats; ++i) {
++ printf(" %d", fmts->formats[i]);
++ }
++ printf("\n");
++}
++#endif
++
++static int query_formats(AVFilterContext *ctx)
++{
++// UnsandContext *s = ctx->priv;
++ int ret;
++
++ // If we aren't connected at both ends then just do nothing
++ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
++ return 0;
++
++ // Our output formats depend on our input formats and we can't/don't
++ // want to convert between bit depths so we need to wait for the source
++ // to have an opinion before we do
++ if (ctx->inputs[0]->incfg.formats == NULL)
++ return AVERROR(EAGAIN);
++
++ // Accept anything
++ if (ctx->inputs[0]->outcfg.formats == NULL &&
++ (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0)
++ return ret;
++
++ // Filter out sand formats
++
++ // Generate a container if we don't already have one
++ if (ctx->outputs[0]->incfg.formats == NULL)
++ {
++ // Somewhat rubbish way of ensuring we have a good structure
++ const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
++ AVFilterFormats *formats = ff_make_format_list(out_fmts);
++
++ if (formats == NULL)
++ return AVERROR(ENOMEM);
++ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0)
++ return ret;
++ }
++
++ // Replace old format list with new filtered list derived from what our
++ // input says it can do
++ {
++ const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats;
++ AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats;
++ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
++ int i;
++ int n = 0;
++ int seen_420p = 0;
++ int seen_420p10 = 0;
++
++ for (i = 0; i < src_ff->nb_formats; ++i) {
++ const enum AVPixelFormat f = src_ff->formats[i];
++
++ switch (f){
++ case AV_PIX_FMT_YUV420P:
++ case AV_PIX_FMT_SAND128:
++ case AV_PIX_FMT_RPI4_8:
++ if (!seen_420p) {
++ seen_420p = 1;
++ dst_fmts[n++] = AV_PIX_FMT_YUV420P;
++ }
++ break;
++ case AV_PIX_FMT_SAND64_10:
++ case AV_PIX_FMT_YUV420P10:
++ case AV_PIX_FMT_RPI4_10:
++ if (!seen_420p10) {
++ seen_420p10 = 1;
++ dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
++ }
++ break;
++ default:
++ dst_fmts[n++] = f;
++ break;
++ }
++ }
++
++ av_freep(&dst_ff->formats);
++ dst_ff->formats = dst_fmts;
++ dst_ff->nb_formats = n;
++ }
++
++// printf("Unsand: %s calc: ", __func__);
++// dump_fmts(ctx->outputs[0]->incfg.formats);
++
++ return 0;
++}
++
++
++#define OFFSET(x) offsetof(UnsandContext, x)
++static const AVOption unsand_options[] = {
++ { NULL }
++};
++
++
++AVFILTER_DEFINE_CLASS(unsand);
++
++static const AVFilterPad avfilter_vf_unsand_inputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO,
++ .filter_frame = filter_frame,
++ },
++ { NULL }
++};
++
++static const AVFilterPad avfilter_vf_unsand_outputs[] = {
++ {
++ .name = "default",
++ .type = AVMEDIA_TYPE_VIDEO
++ },
++};
++
++AVFilter ff_vf_unsand = {
++ .name = "unsand",
++ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
++
++ .init = init,
++ .uninit = uninit,
++
++ FILTER_QUERY_FUNC(query_formats),
++
++ .priv_size = sizeof(UnsandContext),
++ .priv_class = &unsand_class,
++
++ FILTER_INPUTS(avfilter_vf_unsand_inputs),
++ FILTER_OUTPUTS(avfilter_vf_unsand_outputs),
++};
++
+diff --git a/libavfilter/yadif.h b/libavfilter/yadif.h
+index c928911b35..e1a6037f62 100644
+--- a/libavfilter/yadif.h
++++ b/libavfilter/yadif.h
+@@ -53,6 +53,7 @@ typedef struct YADIFContext {
+ int mode; ///< YADIFMode
+ int parity; ///< YADIFParity
+ int deint; ///< YADIFDeint
++ int useasm; ///< Use any asm code
+
+ int frame_pending;
+
+diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
+index 113541bd9a..61e4c976ef 100644
+--- a/libavformat/matroskaenc.c
++++ b/libavformat/matroskaenc.c
+@@ -77,6 +77,10 @@
+
+ #define IS_WEBM(mkv) (CONFIG_WEBM_MUXER && CONFIG_MATROSKA_MUXER ? \
+ ((mkv)->mode == MODE_WEBM) : CONFIG_WEBM_MUXER)
++
++/* Reserved size for H264 headers if not extant at init time */
++#define MAX_H264_HEADER_SIZE 1024
++
+ #define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \
+ !(mkv)->is_live)
+
+@@ -1121,8 +1125,12 @@ static int mkv_assemble_native_codecprivate(AVFormatContext *s, AVIOContext *dyn
+ case AV_CODEC_ID_WAVPACK:
+ return put_wv_codecpriv(dyn_cp, extradata, extradata_size);
+ case AV_CODEC_ID_H264:
+- return ff_isom_write_avcc(dyn_cp, extradata,
+- extradata_size);
++ if (par->extradata_size)
++ return ff_isom_write_avcc(dyn_cp, extradata,
++ extradata_size);
++ else
++ *size_to_reserve = MAX_H264_HEADER_SIZE;
++ break;
+ case AV_CODEC_ID_HEVC:
+ return ff_isom_write_hvcc(dyn_cp, extradata,
+ extradata_size, 0);
+@@ -2731,8 +2739,8 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
+ }
+ break;
+ #endif
+- // FIXME: Remove the following once libaom starts propagating proper extradata during init()
+- // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2208
++ // FIXME: Remove the following once libaom starts propagating extradata during init()
++ // See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012
+ case AV_CODEC_ID_AV1:
+ if (side_data_size && mkv->track.bc && !par->extradata_size) {
+ // If the reserved space doesn't suffice, only write
+@@ -2744,6 +2752,16 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
+ } else if (!par->extradata_size)
+ return AVERROR_INVALIDDATA;
+ break;
++ // H264 V4L2 has a similar issue
++ case AV_CODEC_ID_H264:
++ if (side_data_size && mkv->track.bc && !par->extradata_size) {
++ ret = mkv_update_codecprivate(s, mkv, side_data, side_data_size,
++ par, mkv->track.bc, track, 0);
++ if (ret < 0)
++ return ret;
++ } else if (!par->extradata_size)
++ return AVERROR_INVALIDDATA;
++ break;
+ default:
+ if (side_data_size)
+ av_log(s, AV_LOG_DEBUG, "Ignoring new extradata in a packet for stream %d.\n", pkt->stream_index);
+diff --git a/libavformat/movenc.c b/libavformat/movenc.c
+index c4fcb5f8b1..891adbf7b2 100644
+--- a/libavformat/movenc.c
++++ b/libavformat/movenc.c
+@@ -6343,6 +6343,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
+ if (trk->par->codec_id == AV_CODEC_ID_MP4ALS ||
+ trk->par->codec_id == AV_CODEC_ID_AAC ||
+ trk->par->codec_id == AV_CODEC_ID_AV1 ||
++ trk->par->codec_id == AV_CODEC_ID_H264 ||
+ trk->par->codec_id == AV_CODEC_ID_FLAC) {
+ size_t side_size;
+ uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
+diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
+index a8d296a154..f67dc2a15a 100644
+--- a/libavformat/rtpenc.c
++++ b/libavformat/rtpenc.c
+@@ -19,6 +19,7 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include "avc.h"
+ #include "avformat.h"
+ #include "mpegts.h"
+ #include "internal.h"
+@@ -585,8 +586,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt)
+ ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0);
+ break;
+ case AV_CODEC_ID_H264:
++ {
++ uint8_t *side_data;
++ int side_data_size = 0;
++
++ side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
++ &side_data_size);
++
++ if (side_data_size != 0) {
++ int ps_size = side_data_size;
++ uint8_t * ps_buf = NULL;
++
++ ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size);
++ av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size);
++ ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size);
++ av_free(ps_buf);
++ }
+ ff_rtp_send_h264_hevc(s1, pkt->data, size);
+ break;
++ }
+ case AV_CODEC_ID_H261:
+ ff_rtp_send_h261(s1, pkt->data, size);
+ break;
+diff --git a/libavutil/Makefile b/libavutil/Makefile
+index dc9012f9a8..e33f5db099 100644
+--- a/libavutil/Makefile
++++ b/libavutil/Makefile
+@@ -73,6 +73,7 @@ HEADERS = adler32.h \
+ rational.h \
+ replaygain.h \
+ ripemd.h \
++ rpi_sand_fns.h \
+ samplefmt.h \
+ sha.h \
+ sha512.h \
+@@ -192,6 +193,7 @@ OBJS-$(CONFIG_MACOS_KPERF) += macos_kperf.o
+ OBJS-$(CONFIG_MEDIACODEC) += hwcontext_mediacodec.o
+ OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o
+ OBJS-$(CONFIG_QSV) += hwcontext_qsv.o
++OBJS-$(CONFIG_SAND) += rpi_sand_fns.o
+ OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o
+ OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o
+ OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o
+@@ -212,6 +214,7 @@ SKIPHEADERS-$(CONFIG_D3D11VA) += hwcontext_d3d11va.h
+ SKIPHEADERS-$(CONFIG_DXVA2) += hwcontext_dxva2.h
+ SKIPHEADERS-$(CONFIG_QSV) += hwcontext_qsv.h
+ SKIPHEADERS-$(CONFIG_OPENCL) += hwcontext_opencl.h
++SKIPHEADERS-$(CONFIG_RPI) += rpi_sand_fn_pw.h
+ SKIPHEADERS-$(CONFIG_VAAPI) += hwcontext_vaapi.h
+ SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.h
+ SKIPHEADERS-$(CONFIG_VDPAU) += hwcontext_vdpau.h
+diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
+index eba0151337..1b44beab39 100644
+--- a/libavutil/aarch64/Makefile
++++ b/libavutil/aarch64/Makefile
+@@ -4,3 +4,5 @@ OBJS += aarch64/cpu.o \
+
+ NEON-OBJS += aarch64/float_dsp_neon.o \
+ aarch64/tx_float_neon.o \
++ aarch64/rpi_sand_neon.o \
++
+diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
+new file mode 100644
+index 0000000000..2f07d9674c
+--- /dev/null
++++ b/libavutil/aarch64/rpi_sand_neon.S
+@@ -0,0 +1,781 @@
++/*
++Copyright (c) 2021 Michael Eiler
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: Michael Eiler <eiler.mike@gmail.com>
++*/
++
++#include "asm.S"
++
++// void ff_rpi_sand8_lines_to_planar_y8(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5
++// unsigned int y, : w6
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++
++function ff_rpi_sand8_lines_to_planar_y8, export=1
++ // w15 contains the number of rows we need to process
++ ldr w15, [sp, #0]
++
++ // w8 will contain the number of blocks per row
++ // w8 = floor(_w/stride1)
++ // stride1 is assumed to always be 128
++ mov w8, w1
++ lsr w8, w8, #7
++
++ // in case the width of the image is not a multiple of 128, there will
++ // be an incomplete block at the end of every row
++ // w9 contains the number of pixels stored within this block
++ // w9 = _w - w8 * 128
++ lsl w9, w8, #7
++ sub w9, w7, w9
++
++ // this is the value we have to add to the src pointer after reading a complete block
++ // it will move the address to the start of the next block
++ // w10 = stride2 * stride1 - stride1
++ mov w10, w4
++ lsl w10, w10, #7
++ sub w10, w10, #128
++
++ // w11 is the row offset, meaning the start offset of the first block of every column
++ // this will be increased with stride1 within every iteration of the row_loop
++ eor w11, w11, w11
++
++ // w12 = 0, processed row count
++ eor w12, w12, w12
++row_loop:
++ // start of the first block within the current row
++ // x13 = row offset + src
++ mov x13, x2
++ add x13, x13, x11
++
++ // w14 = 0, processed block count
++ eor w14, w14, w14
++
++ cmp w8, #0
++ beq no_main_y8
++
++block_loop:
++ // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
++ // fortunately these aren't callee saved ones, meaning we don't need to backup them
++ ld1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x13], #64
++ ld1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x13], #64
++
++ // write these registers back to the destination vector and increase the dst address by 128
++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x0], #64
++
++ // move the source register to the beginning of the next block (x13 = src + block offset)
++ add x13, x13, x10
++ // increase the block counter
++ add w14, w14, #1
++
++ // continue with the block_loop if we haven't copied all full blocks yet
++ cmp w8, w14
++ bgt block_loop
++
++ // handle the last block at the end of each row
++ // at most 127 byte values copied from src to dst
++no_main_y8:
++ eor w5, w5, w5 // i = 0
++incomplete_block_loop_y8:
++ cmp w5, w9
++ bge incomplete_block_loop_end_y8
++
++ ldrb w6, [x13]
++ strb w6, [x0]
++ add x13, x13, #1
++ add x0, x0, #1
++
++ add w5, w5, #1
++ b incomplete_block_loop_y8
++incomplete_block_loop_end_y8:
++
++
++ // increase the row offset by 128 (stride1)
++ add w11, w11, #128
++ // increment the row counter
++ add w12, w12, #1
++
++ // process the next row if we haven't finished yet
++ cmp w15, w12
++ bgt row_loop
++
++ ret
++endfunc
++
++
++
++// void ff_rpi_sand8_lines_to_planar_c8(
++// uint8_t * dst_u, : x0
++// unsigned int dst_stride_u, : w1 == width
++// uint8_t * dst_v, : x2
++// unsigned int dst_stride_v, : w3 == width
++// const uint8_t * src, : x4
++// unsigned int stride1, : w5 == 128
++// unsigned int stride2, : w6
++// unsigned int _x, : w7
++// unsigned int y, : [sp, #0]
++// unsigned int _w, : [sp, #8]
++// unsigned int h); : [sp, #16]
++
++function ff_rpi_sand8_lines_to_planar_c8, export=1
++ // w7 = width
++ ldr w7, [sp, #8]
++
++ // w15 contains the number of rows we need to process
++ // counts down
++ ldr w15, [sp, #16]
++
++ // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
++ mov w8, w7
++ lsr w8, w8, #6
++
++ // number of pixels in block at the end of every row
++ // w9 = _w - (w8 * 64)
++ lsl w9, w8, #6
++ sub w9, w7, w9
++
++ // Skip at the end of the line to account for stride
++ sub w12, w1, w7
++
++ // address delta to the beginning of the next block
++ // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
++ lsl w10, w6, #7
++ sub w10, w10, #128
++
++ // w11 = row address start offset = 0
++ eor w11, w11, w11
++
++row_loop_c8:
++ // start of the first block within the current row
++ // x13 = row offset + src
++ mov x13, x4
++ add x13, x13, x11
++
++ // w14 = 0, processed block count
++ eor w14, w14, w14
++
++ cmp w8, #0
++ beq no_main_c8
++
++block_loop_c8:
++ // load the full block -> 128 bytes, the block contains 64 interleaved U and V values
++ ld2 { v0.16b, v1.16b }, [x13], #32
++ ld2 { v2.16b, v3.16b }, [x13], #32
++ ld2 { v4.16b, v5.16b }, [x13], #32
++ ld2 { v6.16b, v7.16b }, [x13], #32
++
++ // swap register so that we can write them out with a single instruction
++ mov v16.16b, v1.16b
++ mov v17.16b, v3.16b
++ mov v18.16b, v5.16b
++ mov v1.16b, v2.16b
++ mov v2.16b, v4.16b
++ mov v3.16b, v6.16b
++ mov v4.16b, v16.16b
++ mov v5.16b, v17.16b
++ mov v6.16b, v18.16b
++
++ st1 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0], #64
++ st1 { v4.16b, v5.16b, v6.16b, v7.16b }, [x2], #64
++
++ // increment row counter and move src to the beginning of the next block
++ add w14, w14, #1
++ add x13, x13, x10
++
++ // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
++ cmp w8, w14
++ bgt block_loop_c8
++
++no_main_c8:
++ // handle incomplete block at the end of every row
++ eor w5, w5, w5 // point counter, this might be
++incomplete_block_loop_c8:
++ cmp w5, w9
++ bge incomplete_block_loop_end_c8
++
++ ldrb w1, [x13]
++ strb w1, [x0]
++ add x13, x13, #1
++
++ ldrb w1, [x13]
++ strb w1, [x2]
++ add x13, x13, #1
++
++ add x0, x0, #1
++ add x2, x2, #1
++
++ add w5, w5, #1
++ b incomplete_block_loop_c8
++incomplete_block_loop_end_c8:
++
++ // increase row_offset by stride1
++ add w11, w11, #128
++ add x0, x0, w12, sxtw
++ add x2, x2, w12, sxtw
++
++ // jump to row_loop_c8 iff rows are still left to process
++ subs w15, w15, #1
++ bgt row_loop_c8
++
++ ret
++endfunc
++
++//void ff_rpi_sand30_lines_to_planar_c16(
++// uint8_t * dst_u, // [x0]
++// unsigned int dst_stride_u, // [w1] == _w*2
++// uint8_t * dst_v, // [x2]
++// unsigned int dst_stride_v, // [w3] == _w*2
++// const uint8_t * src, // [x4]
++// unsigned int stride1, // [w5] == 128
++// unsigned int stride2, // [w6]
++// unsigned int _x, // [w7] == 0
++// unsigned int y, // [sp, #0] == 0
++// unsigned int _w, // [sp, #8] -> w3
++// unsigned int h); // [sp, #16] -> w7
++
++.macro rpi_sand30_lines_to_planar_c16_block_half
++ ld1 { v0.4s, v1.4s, v2.4s, v3.4s }, [x13], #64
++
++ xtn v4.4h, v0.4s
++ ushr v0.4s, v0.4s, #10
++ xtn v5.4h, v0.4s
++ ushr v0.4s, v0.4s, #10
++ xtn v6.4h, v0.4s
++ xtn2 v4.8h, v1.4s
++ ushr v1.4s, v1.4s, #10
++ xtn2 v5.8h, v1.4s
++ ushr v1.4s, v1.4s, #10
++ xtn2 v6.8h, v1.4s
++ and v4.16b, v4.16b, v16.16b
++ and v5.16b, v5.16b, v16.16b
++ and v6.16b, v6.16b, v16.16b
++ st3 { v4.8h, v5.8h, v6.8h }, [sp], #48
++
++ xtn v4.4h, v2.4s
++ ushr v2.4s, v2.4s, #10
++ xtn v5.4h, v2.4s
++ ushr v2.4s, v2.4s, #10
++ xtn v6.4h, v2.4s
++ xtn2 v4.8h, v3.4s
++ ushr v3.4s, v3.4s, #10
++ xtn2 v5.8h, v3.4s
++ ushr v3.4s, v3.4s, #10
++ xtn2 v6.8h, v3.4s
++ and v4.16b, v4.16b, v16.16b
++ and v5.16b, v5.16b, v16.16b
++ and v6.16b, v6.16b, v16.16b
++ st3 { v4.8h, v5.8h, v6.8h }, [sp]
++ sub sp, sp, #48
++.endm
++
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++ stp x19, x20, [sp, #-48]!
++ stp x21, x22, [sp, #16]
++ stp x23, x24, [sp, #32]
++
++ ldr w3, [sp, #48+8] // w3 = width
++ ldr w7, [sp, #48+16] // w7 = height
++
++ // reserve space on the stack for intermediate results
++ sub sp, sp, #256
++
++ // number of 128byte blocks per row, w8 = width / 48
++ mov w9, #48
++ udiv w8, w3, w9
++
++ // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
++ mul w9, w8, w9
++ sub w9, w3, w9
++
++ // row offset, the beginning of the next row to process
++ eor w10, w10, w10
++
++ // offset to the beginning of the next block, w11 = stride2 * 128 - 128
++ lsl w11, w6, #7
++ sub w11, w11, #128
++
++ // decrease the height by one and in case of remaining pixels increase the block count by one
++ sub w7, w7, #1
++ cmp w9, #0
++ cset w19, ne // w19 == 1 iff remaining pixels != 0
++ add w8, w8, w19
++
++ // bytes we have to move dst back by at the end of every row
++ mov w21, #48*2
++ mul w21, w21, w8
++ sub w21, w1, w21
++
++ mov w20, #0 // w20 = flag, last row processed
++
++ mov x12, #0x03ff03ff03ff03ff
++ dup v16.2d, x12
++
++ // iterate through rows, row counter = w12 = 0
++ eor w12, w12, w12
++row_loop_c16:
++ cmp w12, w7
++ bge row_loop_c16_fin
++
++ // address of row data = src + row_offset
++ mov x13, x4
++ add x13, x13, x10
++
++ eor w14, w14, w14
++block_loop_c16:
++ cmp w14, w8
++ bge block_loop_c16_fin
++
++ rpi_sand30_lines_to_planar_c16_block_half
++
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #64
++
++ st1 { v0.8h }, [x0], #16
++ st1 { v2.8h }, [x0], #16
++ st1 { v4.8h }, [x0], #16
++ st1 { v1.8h }, [x2], #16
++ st1 { v3.8h }, [x2], #16
++ st1 { v5.8h }, [x2], #16
++
++ rpi_sand30_lines_to_planar_c16_block_half
++
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #64
++
++ st1 { v0.8h }, [x0], #16
++ st1 { v2.8h }, [x0], #16
++ st1 { v4.8h }, [x0], #16
++ st1 { v1.8h }, [x2], #16
++ st1 { v3.8h }, [x2], #16
++ st1 { v5.8h }, [x2], #16
++
++ add x13, x13, x11 // offset to next block
++ add w14, w14, #1
++ b block_loop_c16
++block_loop_c16_fin:
++
++ add w10, w10, #128
++ add w12, w12, #1
++ add x0, x0, w21, sxtw // move dst pointers back by x21
++ add x2, x2, w21, sxtw
++ b row_loop_c16
++row_loop_c16_fin:
++
++ cmp w20, #1
++ beq row_loop_c16_fin2
++ mov w20, #1
++ sub w8, w8, w19 // decrease block count by w19
++ add w7, w7, #1 // increase height
++ b row_loop_c16
++
++row_loop_c16_fin2:
++ sub x0, x0, w21, sxtw // re-add w21 in case of the last row
++ sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
++
++ // last incomplete block to be finished
++ // read operations are fine, stride2 is more than large enough even if rem_pix is 0
++ rpi_sand30_lines_to_planar_c16_block_half
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp], #32
++ rpi_sand30_lines_to_planar_c16_block_half
++ ld2 { v0.8h, v1.8h }, [sp], #32
++ ld2 { v2.8h, v3.8h }, [sp], #32
++ ld2 { v4.8h, v5.8h }, [sp]
++ sub sp, sp, #160
++
++ mov x4, sp
++ eor w20, w20, w20
++rem_pix_c16_loop:
++ cmp w20, w9
++ bge rem_pix_c16_fin
++
++ ldr w22, [x4], #4
++ str w22, [x0], #2
++ lsr w22, w22, #16
++ str w22, [x2], #2
++
++ add w20, w20, #1
++ b rem_pix_c16_loop
++rem_pix_c16_fin:
++
++ add sp, sp, #256
++
++ ldp x23, x24, [sp, #32]
++ ldp x21, x22, [sp, #16]
++ ldp x19, x20, [sp], #48
++ ret
++endfunc
++
++
++
++//void ff_rpi_sand30_lines_to_planar_p010(
++// uint8_t * dest,
++// unsigned int dst_stride,
++// const uint8_t * src,
++// unsigned int src_stride1,
++// unsigned int src_stride2,
++// unsigned int _x,
++// unsigned int y,
++// unsigned int _w,
++// unsigned int h);
++
++// void ff_rpi_sand30_lines_to_planar_y16(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5
++// unsigned int y, : w6
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for wri
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++ lsl w4, w4, #7
++ sub w4, w4, #64
++ sub w1, w1, w7, lsl #1
++ uxtw x6, w6
++ add x8, x2, x6, lsl #7
++ ldr w6, [sp, #0]
++
++10:
++ mov x2, x8
++ mov w5, w7
++1:
++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++ subs w5, w5, #96
++
++ // v0, v1
++
++ shrn v18.4h, v0.4s, #14
++ xtn v16.4h, v0.4s
++ shrn v17.4h, v0.4s, #10
++
++ shrn2 v18.8h, v1.4s, #14
++ xtn2 v16.8h, v1.4s
++ shrn2 v17.8h, v1.4s, #10
++
++ ushr v18.8h, v18.8h, #6
++ bic v16.8h, #0xfc, lsl #8
++ bic v17.8h, #0xfc, lsl #8
++
++ // v2, v3
++
++ shrn v21.4h, v2.4s, #14
++ xtn v19.4h, v2.4s
++ shrn v20.4h, v2.4s, #10
++
++ shrn2 v21.8h, v3.4s, #14
++ xtn2 v19.8h, v3.4s
++ shrn2 v20.8h, v3.4s, #10
++
++ ushr v21.8h, v21.8h, #6
++ bic v19.8h, #0xfc, lsl #8
++ bic v20.8h, #0xfc, lsl #8
++
++ // v4, v5
++
++ shrn v24.4h, v4.4s, #14
++ xtn v22.4h, v4.4s
++ shrn v23.4h, v4.4s, #10
++
++ shrn2 v24.8h, v5.4s, #14
++ xtn2 v22.8h, v5.4s
++ shrn2 v23.8h, v5.4s, #10
++
++ ushr v24.8h, v24.8h, #6
++ bic v22.8h, #0xfc, lsl #8
++ bic v23.8h, #0xfc, lsl #8
++
++ // v6, v7
++
++ shrn v27.4h, v6.4s, #14
++ xtn v25.4h, v6.4s
++ shrn v26.4h, v6.4s, #10
++
++ shrn2 v27.8h, v7.4s, #14
++ xtn2 v25.8h, v7.4s
++ shrn2 v26.8h, v7.4s, #10
++
++ ushr v27.8h, v27.8h, #6
++ bic v25.8h, #0xfc, lsl #8
++ bic v26.8h, #0xfc, lsl #8
++
++ blt 2f
++
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
++ st3 {v22.8h, v23.8h, v24.8h}, [x0], #48
++ st3 {v25.8h, v26.8h, v27.8h}, [x0], #48
++
++ bne 1b
++
++11:
++ subs w6, w6, #1
++ add x0, x0, w1, uxtw
++ add x8, x8, #128
++ bne 10b
++
++ ret
++
++// Partial final write
++2:
++ cmp w5, #48-96
++ blt 1f
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ st3 {v19.8h, v20.8h, v21.8h}, [x0], #48
++ beq 11b
++ mov v16.16b, v22.16b
++ mov v17.16b, v23.16b
++ sub w5, w5, #48
++ mov v18.16b, v24.16b
++ mov v19.16b, v25.16b
++ mov v20.16b, v26.16b
++ mov v21.16b, v27.16b
++1:
++ cmp w5, #24-96
++ blt 1f
++ st3 {v16.8h, v17.8h, v18.8h}, [x0], #48
++ beq 11b
++ mov v16.16b, v19.16b
++ mov v17.16b, v20.16b
++ sub w5, w5, #24
++ mov v18.16b, v21.16b
++1:
++ cmp w5, #12-96
++ blt 1f
++ st3 {v16.4h, v17.4h, v18.4h}, [x0], #24
++ beq 11b
++ mov v16.2d[0], v16.2d[1]
++ sub w5, w5, #12
++ mov v17.2d[0], v17.2d[1]
++ mov v18.2d[0], v18.2d[1]
++1:
++ cmp w5, #6-96
++ blt 1f
++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
++ st3 {v16.h, v17.h, v18.h}[1], [x0], #6
++ beq 11b
++ mov v16.2s[0], v16.2s[1]
++ sub w5, w5, #6
++ mov v17.2s[0], v17.2s[1]
++ mov v18.2s[0], v18.2s[1]
++1:
++ cmp w5, #3-96
++ blt 1f
++ st3 {v16.h, v17.h, v18.h}[0], [x0], #6
++ beq 11b
++ mov v16.4h[0], v16.4h[1]
++ sub w5, w5, #3
++ mov v17.4h[0], v17.4h[1]
++1:
++ cmp w5, #2-96
++ blt 1f
++ st2 {v16.h, v17.h}[0], [x0], #4
++ b 11b
++1:
++ st1 {v16.h}[0], [x0], #2
++ b 11b
++
++endfunc
++
++// void ff_rpi_sand30_lines_to_planar_y8(
++// uint8_t * dest, : x0
++// unsigned int dst_stride, : w1
++// const uint8_t * src, : x2
++// unsigned int src_stride1, : w3, always 128
++// unsigned int src_stride2, : w4
++// unsigned int _x, : w5
++// unsigned int y, : w6
++// unsigned int _w, : w7
++// unsigned int h); : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++ lsl w4, w4, #7
++ sub w4, w4, #64
++ sub w1, w1, w7
++ uxtw x6, w6
++ add x8, x2, x6, lsl #7
++ ldr w6, [sp, #0]
++
++10:
++ mov x2, x8
++ mov w5, w7
++1:
++ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++ ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++ subs w5, w5, #96
++
++ // v0, v1
++
++ shrn v18.4h, v0.4s, #16
++ xtn v16.4h, v0.4s
++ shrn v17.4h, v0.4s, #12
++
++ shrn2 v18.8h, v1.4s, #16
++ xtn2 v16.8h, v1.4s
++ shrn2 v17.8h, v1.4s, #12
++
++ shrn v18.8b, v18.8h, #6
++ shrn v16.8b, v16.8h, #2
++ xtn v17.8b, v17.8h
++
++ // v2, v3
++
++ shrn v21.4h, v2.4s, #16
++ xtn v19.4h, v2.4s
++ shrn v20.4h, v2.4s, #12
++
++ shrn2 v21.8h, v3.4s, #16
++ xtn2 v19.8h, v3.4s
++ shrn2 v20.8h, v3.4s, #12
++
++ shrn2 v18.16b, v21.8h, #6
++ shrn2 v16.16b, v19.8h, #2
++ xtn2 v17.16b, v20.8h
++
++ // v4, v5
++
++ shrn v24.4h, v4.4s, #16
++ xtn v22.4h, v4.4s
++ shrn v23.4h, v4.4s, #12
++
++ shrn2 v24.8h, v5.4s, #16
++ xtn2 v22.8h, v5.4s
++ shrn2 v23.8h, v5.4s, #12
++
++ shrn v21.8b, v24.8h, #6
++ shrn v19.8b, v22.8h, #2
++ xtn v20.8b, v23.8h
++
++ // v6, v7
++
++ shrn v27.4h, v6.4s, #16
++ xtn v25.4h, v6.4s
++ shrn v26.4h, v6.4s, #12
++
++ shrn2 v27.8h, v7.4s, #16
++ xtn2 v25.8h, v7.4s
++ shrn2 v26.8h, v7.4s, #12
++
++ shrn2 v21.16b, v27.8h, #6
++ shrn2 v19.16b, v25.8h, #2
++ xtn2 v20.16b, v26.8h
++
++ blt 2f
++
++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
++ st3 {v19.16b, v20.16b, v21.16b}, [x0], #48
++
++ bne 1b
++
++11:
++ subs w6, w6, #1
++ add x0, x0, w1, uxtw
++ add x8, x8, #128
++ bne 10b
++
++ ret
++
++// Partial final write
++2:
++ cmp w5, #48-96
++ blt 1f
++ st3 {v16.16b, v17.16b, v18.16b}, [x0], #48
++ beq 11b
++ mov v16.16b, v22.16b
++ mov v17.16b, v23.16b
++ sub w5, w5, #48
++ mov v18.16b, v24.16b
++1:
++ cmp w5, #24-96
++ blt 1f
++ st3 {v16.8b, v17.8b, v18.8b}, [x0], #24
++ beq 11b
++ mov v16.2d[0], v16.2d[1]
++ sub w5, w5, #24
++ mov v17.2d[0], v17.2d[1]
++ mov v18.2d[0], v18.2d[1]
++1:
++ cmp w5, #12-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[2], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[3], [x0], #3
++ beq 11b
++ mov v16.2s[0], v16.2s[1]
++ sub w5, w5, #12
++ mov v17.2s[0], v17.2s[1]
++ mov v18.2s[0], v18.2s[1]
++1:
++ cmp w5, #6-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ st3 {v16.b, v17.b, v18.b}[1], [x0], #3
++ beq 11b
++ mov v16.4h[0], v16.4h[1]
++ sub w5, w5, #6
++ mov v17.4h[0], v17.4h[1]
++ mov v18.4h[0], v18.4h[1]
++1:
++ cmp w5, #3-96
++ blt 1f
++ st3 {v16.b, v17.b, v18.b}[0], [x0], #3
++ beq 11b
++ mov v16.8b[0], v16.8b[1]
++ sub w5, w5, #3
++ mov v17.8b[0], v17.8b[1]
++1:
++ cmp w5, #2-96
++ blt 1f
++ st2 {v16.b, v17.b}[0], [x0], #2
++ b 11b
++1:
++ st1 {v16.b}[0], [x0], #1
++ b 11b
++
++endfunc
++
+diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
+new file mode 100644
+index 0000000000..2a56135bc3
+--- /dev/null
++++ b/libavutil/aarch64/rpi_sand_neon.h
+@@ -0,0 +1,59 @@
++/*
++Copyright (c) 2021 Michael Eiler
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: Michael Eiler <eiler.mike@gmail.com>
++*/
++
++#pragma once
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
++ unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
++ uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
++ unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++ const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++ unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++#ifdef __cplusplus
++}
++#endif
++
+diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
+index 5da44b0542..b74b7c4e2f 100644
+--- a/libavutil/arm/Makefile
++++ b/libavutil/arm/Makefile
+@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \
+
+ NEON-OBJS += arm/float_dsp_init_neon.o \
+ arm/float_dsp_neon.o \
++ arm/rpi_sand_neon.o \
+diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
+new file mode 100644
+index 0000000000..60e697f681
+--- /dev/null
++++ b/libavutil/arm/rpi_sand_neon.S
+@@ -0,0 +1,925 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "libavutil/arm/asm.S"
++
++
++@ General notes:
++@ Having done some timing on this in sand8->y8 (Pi4)
++@ vst1 (680fps) is a bit faster than vstm (660fps)
++@ vldm (680fps) is noticably faster than vld1 (480fps)
++@ (or it might be that a mix is what is required)
++@
++@ At least on a Pi4 it is no more expensive to have a single auto-inc register
++@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
++@ the latter was better)
++@
++@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
++@ the memory is uncached.
++@ As these are Sand -> planar we can assume that src is going to be aligned but
++@ it is possible that dest isn't (converting to .yuv or other packed format).
++@ Luckily vst1 is faster than vstm :-) so all is well
++@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
++@ .8 stores would let us do non-word aligned stores into uncached but it
++@ probably isn't worth it.
++
++
++
++
++@ void ff_rpi_sand128b_stripe_to_8_10(
++@ uint8_t * dest, // [r0]
++@ const uint8_t * src1, // [r1]
++@ const uint8_t * src2, // [r2]
++@ unsigned int lines); // [r3]
++
++.macro stripe2_to_8, bit_depth
++ vpush {q4-q7}
++1:
++ vldm r1!, {q0-q7}
++ subs r3, #1
++ vldm r2!, {q8-q15}
++ vqrshrn.u16 d0, q0, #\bit_depth - 8
++ vqrshrn.u16 d1, q1, #\bit_depth - 8
++ vqrshrn.u16 d2, q2, #\bit_depth - 8
++ vqrshrn.u16 d3, q3, #\bit_depth - 8
++ vqrshrn.u16 d4, q4, #\bit_depth - 8
++ vqrshrn.u16 d5, q5, #\bit_depth - 8
++ vqrshrn.u16 d6, q6, #\bit_depth - 8
++ vqrshrn.u16 d7, q7, #\bit_depth - 8
++ vqrshrn.u16 d8, q8, #\bit_depth - 8
++ vqrshrn.u16 d9, q9, #\bit_depth - 8
++ vqrshrn.u16 d10, q10, #\bit_depth - 8
++ vqrshrn.u16 d11, q11, #\bit_depth - 8
++ vqrshrn.u16 d12, q12, #\bit_depth - 8
++ vqrshrn.u16 d13, q13, #\bit_depth - 8
++ vqrshrn.u16 d14, q14, #\bit_depth - 8
++ vqrshrn.u16 d15, q15, #\bit_depth - 8
++ vstm r0!, {q0-q7}
++ bne 1b
++ vpop {q4-q7}
++ bx lr
++.endm
++
++function ff_rpi_sand128b_stripe_to_8_10, export=1
++ stripe2_to_8 10
++endfunc
++
++@ void ff_rpi_sand8_lines_to_planar_y8(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand8_lines_to_planar_y8, export=1
++ push {r4-r8, lr} @ +24 L
++ ldr r3, [sp, #24]
++ ldr r6, [sp, #36]
++ ldr r7, [sp, #32] @ y
++ lsl r3, #7
++ sub r1, r6
++ add r8, r2, r7, lsl #7
++ ldr r7, [sp, #40]
++
++10:
++ mov r2, r8
++ add r4, r0, #24
++ mov r5, r6
++ mov lr, #0
++1:
++ vldm r2, {q8-q15}
++ add r2, r3
++ subs r5, #128
++ blt 2f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d20, d21, d22, d23}, [r0]!
++ vst1.8 {d24, d25, d26, d27}, [r0]!
++ vst1.8 {d28, d29, d30, d31}, [r0]!
++ bne 1b
++11:
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #64-128
++ blt 1f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d20, d21, d22, d23}, [r0]!
++ beq 11b
++ vmov q8, q12
++ vmov q9, q13
++ sub r5, #64
++ vmov q10, q14
++ vmov q11, q15
++1:
++ cmp r5, #32-128
++ blt 1f
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ beq 11b
++ vmov q8, q10
++ sub r5, #32
++ vmov q9, q11
++1:
++ cmp r5, #16-128
++ blt 1f
++ vst1.8 {d16, d17}, [r0]!
++ beq 11b
++ sub r5, #16
++ vmov q8, q9
++1:
++ cmp r5, #8-128
++ blt 1f
++ vst1.8 {d16}, [r0]!
++ beq 11b
++ sub r5, #8
++ vmov d16, d17
++1:
++ cmp r5, #4-128
++ blt 1f
++ vst1.32 {d16[0]}, [r0]!
++ beq 11b
++ sub r5, #4
++ vshr.u64 d16, #32
++1:
++ cmp r5, #2-128
++ blt 1f
++ vst1.16 {d16[0]}, [r0]!
++ beq 11b
++ vst1.8 {d16[2]}, [r0]!
++ b 11b
++1:
++ vst1.8 {d16[0]}, [r0]!
++ b 11b
++endfunc
++
++@ void ff_rpi_sand8_lines_to_planar_c8(
++@ uint8_t * dst_u, // [r0]
++@ unsigned int dst_stride_u, // [r1]
++@ uint8_t * dst_v, // [r2]
++@ unsigned int dst_stride_v, // [r3]
++@ const uint8_t * src, // [sp, #0] -> r4, r5
++@ unsigned int stride1, // [sp, #4] 128
++@ unsigned int stride2, // [sp, #8] -> r8
++@ unsigned int _x, // [sp, #12] 0
++@ unsigned int y, // [sp, #16] (r7 in prefix)
++@ unsigned int _w, // [sp, #20] -> r12, r6
++@ unsigned int h); // [sp, #24] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand8_lines_to_planar_c8, export=1
++ push {r4-r8, lr} @ +24
++
++ ldr r5, [sp, #24]
++ ldr r8, [sp, #32]
++ ldr r7, [sp, #40]
++ ldr r6, [sp, #44]
++ lsl r8, #7
++ add r5, r5, r7, lsl #7
++ sub r1, r1, r6
++ sub r3, r3, r6
++ ldr r7, [sp, #48]
++ vpush {q4-q7}
++
++10:
++ mov r4, r5
++ mov r12, r6
++1:
++ subs r12, #64
++ vldm r4, {q0-q7}
++ add r4, r8
++ it gt
++ vldmgt r4, {q8-q15}
++ add r4, r8
++
++ vuzp.8 q0, q1
++ vuzp.8 q2, q3
++ vuzp.8 q4, q5
++ vuzp.8 q6, q7
++
++ vuzp.8 q8, q9
++ vuzp.8 q10, q11
++ vuzp.8 q12, q13
++ vuzp.8 q14, q15
++ subs r12, #64
++
++ @ Rearrange regs so we can use vst1 with 4 regs
++ vswp q1, q2
++ vswp q5, q6
++ vswp q9, q10
++ vswp q13, q14
++ blt 2f
++
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d8, d9, d10, d11}, [r0]!
++ vst1.8 {d16, d17, d18, d19}, [r0]!
++ vst1.8 {d24, d25, d26, d27}, [r0]!
++
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ vst1.8 {d12, d13, d14, d15}, [r2]!
++ vst1.8 {d20, d21, d22, d23}, [r2]!
++ vst1.8 {d28, d29, d30, d31}, [r2]!
++ bne 1b
++11:
++ subs r7, #1
++ add r5, #128
++ add r0, r1
++ add r2, r3
++ bne 10b
++ vpop {q4-q7}
++ pop {r4-r8,pc}
++
++2:
++ cmp r12, #64-128
++ blt 1f
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d8, d9, d10, d11}, [r0]!
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ vst1.8 {d12, d13, d14, d15}, [r2]!
++ beq 11b
++ sub r12, #64
++ vmov q0, q8
++ vmov q1, q9
++ vmov q2, q10
++ vmov q3, q11
++ vmov q4, q12
++ vmov q5, q13
++ vmov q6, q14
++ vmov q7, q15
++1:
++ cmp r12, #32-128
++ blt 1f
++ vst1.8 {d0, d1, d2, d3 }, [r0]!
++ vst1.8 {d4, d5, d6, d7 }, [r2]!
++ beq 11b
++ sub r12, #32
++ vmov q0, q4
++ vmov q1, q5
++ vmov q2, q6
++ vmov q3, q7
++1:
++ cmp r12, #16-128
++ blt 1f
++ vst1.8 {d0, d1 }, [r0]!
++ vst1.8 {d4, d5 }, [r2]!
++ beq 11b
++ sub r12, #16
++ vmov q0, q1
++ vmov q2, q3
++1:
++ cmp r12, #8-128
++ blt 1f
++ vst1.8 {d0}, [r0]!
++ vst1.8 {d4}, [r2]!
++ beq 11b
++ sub r12, #8
++ vmov d0, d1
++ vmov d4, d5
++1:
++ cmp r12, #4-128
++ blt 1f
++ vst1.32 {d0[0]}, [r0]!
++ vst1.32 {d4[0]}, [r2]!
++ beq 11b
++ sub r12, #4
++ vmov s0, s1
++ vmov s8, s9
++1:
++ cmp r12, #2-128
++ blt 1f
++ vst1.16 {d0[0]}, [r0]!
++ vst1.16 {d4[0]}, [r2]!
++ beq 11b
++ vst1.8 {d0[2]}, [r0]!
++ vst1.8 {d4[2]}, [r2]!
++ b 11b
++1:
++ vst1.8 {d0[0]}, [r0]!
++ vst1.8 {d4[0]}, [r2]!
++ b 11b
++endfunc
++
++
++
++@ void ff_rpi_sand30_lines_to_planar_y16(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24]
++ ldr r6, [sp, #36]
++ ldr r7, [sp, #32] @ y
++ mov r12, #48
++ sub r3, #1
++ lsl r3, #7
++ sub r1, r1, r6, lsl #1
++ add r8, r2, r7, lsl #7
++ ldr r7, [sp, #40]
++
++10:
++ mov r2, r8
++ add r4, r0, #24
++ mov r5, r6
++ mov lr, #0
++1:
++ vldm r2!, {q10-q13}
++ add lr, #64
++
++ vshrn.u32 d4 , q10, #14 @ Cannot vshrn.u32 #20!
++ ands lr, #127
++ vshrn.u32 d2, q10, #10
++ vmovn.u32 d0, q10
++
++ vshrn.u32 d5, q11, #14
++ it eq
++ addeq r2, r3
++ vshrn.u32 d3, q11, #10
++ vmovn.u32 d1, q11
++
++ subs r5, #48
++ vshr.u16 q2, #6
++ vbic.u16 q0, #0xfc00
++ vbic.u16 q1, #0xfc00
++
++ vshrn.u32 d20, q12, #14
++ vshrn.u32 d18, q12, #10
++ vmovn.u32 d16, q12
++
++ vshrn.u32 d21, q13, #14
++ vshrn.u32 d19, q13, #10
++ vmovn.u32 d17, q13
++
++ vshr.u16 q10, #6
++ vbic.u16 q8, #0xfc00
++ vbic.u16 q9 , #0xfc00
++ blt 2f
++
++ vst3.16 {d0, d2, d4}, [r0], r12
++ vst3.16 {d1, d3, d5}, [r4], r12
++ vst3.16 {d16, d18, d20}, [r0], r12
++ vst3.16 {d17, d19, d21}, [r4], r12
++
++ bne 1b
++
++11:
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #24-48
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0], r12
++ vst3.16 {d1, d3, d5}, [r4]
++ beq 11b
++ vmov q0, q8
++ sub r5, #24
++ vmov q1, q9
++ vmov q2, q10
++1:
++ cmp r5, #12-48
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0]!
++ beq 11b
++ vmov d0, d1
++ sub r5, #12
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r5, #6-48
++ add r4, r0, #6 @ avoid [r0]! on sequential instructions
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
++ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
++ add r0, #12
++ beq 11b
++ vmov s0, s1
++ sub r5, #6
++ vmov s4, s5
++ vmov s8, s9
++1:
++ cmp r5, #3-48
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #16
++ vshr.u32 d2, #16
++1:
++ cmp r5, #2-48
++ blt 1f
++ vst2.16 {d0[0], d2[0]}, [r0]!
++ b 11b
++1:
++ vst1.16 {d0[0]}, [r0]!
++ b 11b
++
++endfunc
++
++
++@ void ff_rpi_sand30_lines_to_planar_c16(
++@ uint8_t * dst_u, // [r0]
++@ unsigned int dst_stride_u, // [r1]
++@ uint8_t * dst_v, // [r2]
++@ unsigned int dst_stride_v, // [r3]
++@ const uint8_t * src, // [sp, #0] -> r4, r5
++@ unsigned int stride1, // [sp, #4] 128
++@ unsigned int stride2, // [sp, #8] -> r8
++@ unsigned int _x, // [sp, #12] 0
++@ unsigned int y, // [sp, #16] (r7 in prefix)
++@ unsigned int _w, // [sp, #20] -> r6, r9
++@ unsigned int h); // [sp, #24] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++ push {r4-r10, lr} @ +32
++ ldr r5, [sp, #32]
++ ldr r8, [sp, #40]
++ ldr r7, [sp, #48]
++ ldr r9, [sp, #52]
++ mov r12, #48
++ sub r8, #1
++ lsl r8, #7
++ add r5, r5, r7, lsl #7
++ sub r1, r1, r9, lsl #1
++ sub r3, r3, r9, lsl #1
++ ldr r7, [sp, #56]
++10:
++ mov lr, #0
++ mov r4, r5
++ mov r6, r9
++1:
++ vldm r4!, {q0-q3}
++ add lr, #64
++
++ @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
++ vshrn.u32 d20, q0, #14
++ vmovn.u32 d18, q0
++ vshrn.u32 d0, q0, #10
++ ands lr, #127
++
++ vshrn.u32 d21, q1, #14
++ vmovn.u32 d19, q1
++ vshrn.u32 d1, q1, #10
++
++ vshrn.u32 d22, q2, #10
++ vmovn.u32 d2, q2
++ vshrn.u32 d4, q2, #14
++
++ add r10, r0, #24
++ vshrn.u32 d23, q3, #10
++ vmovn.u32 d3, q3
++ vshrn.u32 d5, q3, #14
++
++ it eq
++ addeq r4, r8
++ vuzp.16 q0, q11
++ vuzp.16 q9, q1
++ vuzp.16 q10, q2
++
++ @ q0 V0, V3,..
++ @ q9 U0, U3...
++ @ q10 U1, U4...
++ @ q11 U2, U5,..
++ @ q1 V1, V4,
++ @ q2 V2, V5,..
++
++ subs r6, #24
++ vbic.u16 q11, #0xfc00
++ vbic.u16 q9, #0xfc00
++ vshr.u16 q10, #6
++ vshr.u16 q2, #6
++ vbic.u16 q0, #0xfc00
++ vbic.u16 q1, #0xfc00
++
++ blt 2f
++
++ vst3.16 {d18, d20, d22}, [r0], r12
++ vst3.16 {d19, d21, d23}, [r10]
++ add r10, r2, #24
++ vst3.16 {d0, d2, d4}, [r2], r12
++ vst3.16 {d1, d3, d5}, [r10]
++
++ bne 1b
++
++11:
++ subs r7, #1
++ add r5, #128
++ add r0, r1
++ add r2, r3
++ bne 10b
++
++ pop {r4-r10, pc}
++
++@ Partial final write
++2:
++ cmp r6, #-12
++ blt 1f
++ vst3.16 {d18, d20, d22}, [r0]!
++ vst3.16 {d0, d2, d4}, [r2]!
++ beq 11b
++ vmov d18, d19
++ vmov d20, d21
++ vmov d22, d23
++ sub r6, #12
++ vmov d0, d1
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r6, #-18
++ @ Rezip here as it makes the remaining tail handling easier
++ vzip.16 d0, d18
++ vzip.16 d2, d20
++ vzip.16 d4, d22
++ blt 1f
++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
++ vst3.16 {d0[3], d2[3], d4[3]}, [r0]!
++ vst3.16 {d0[2], d2[2], d4[2]}, [r2]!
++ beq 11b
++ vmov d0, d18
++ vmov d2, d20
++ sub r6, #6
++ vmov d4, d22
++1:
++ cmp r6, #-21
++ blt 1f
++ vst3.16 {d0[1], d2[1], d4[1]}, [r0]!
++ vst3.16 {d0[0], d2[0], d4[0]}, [r2]!
++ beq 11b
++ vmov s4, s5
++ sub r6, #3
++ vmov s0, s1
++1:
++ cmp r6, #-22
++ blt 1f
++ vst2.16 {d0[1], d2[1]}, [r0]!
++ vst2.16 {d0[0], d2[0]}, [r2]!
++ b 11b
++1:
++ vst1.16 {d0[1]}, [r0]!
++ vst1.16 {d0[0]}, [r2]!
++ b 11b
++
++endfunc
++
++@ void ff_rpi_sand30_lines_to_planar_p010(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_p010, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24]
++ ldr r6, [sp, #36]
++ ldr r7, [sp, #32] @ y
++ mov r12, #48
++ vmov.u16 q15, #0xffc0
++ sub r3, #1
++ lsl r3, #7
++ sub r1, r1, r6, lsl #1
++ add r8, r2, r7, lsl #7
++ ldr r7, [sp, #40]
++
++10:
++ mov r2, r8
++ add r4, r0, #24
++ mov r5, r6
++ mov lr, #0
++1:
++ vldm r2!, {q10-q13}
++ add lr, #64
++
++ vshl.u32 q14, q10, #6
++ ands lr, #127
++ vshrn.u32 d4, q10, #14
++ vshrn.u32 d2, q10, #4
++ vmovn.u32 d0, q14
++
++ vshl.u32 q14, q11, #6
++ it eq
++ addeq r2, r3
++ vshrn.u32 d5, q11, #14
++ vshrn.u32 d3, q11, #4
++ vmovn.u32 d1, q14
++
++ subs r5, #48
++ vand q2, q15
++ vand q1, q15
++ vand q0, q15
++
++ vshl.u32 q14, q12, #6
++ vshrn.u32 d20, q12, #14
++ vshrn.u32 d18, q12, #4
++ vmovn.u32 d16, q14
++
++ vshl.u32 q14, q13, #6
++ vshrn.u32 d21, q13, #14
++ vshrn.u32 d19, q13, #4
++ vmovn.u32 d17, q14
++
++ vand q10, q15
++ vand q9, q15
++ vand q8, q15
++ blt 2f
++
++ vst3.16 {d0, d2, d4}, [r0], r12
++ vst3.16 {d1, d3, d5}, [r4], r12
++ vst3.16 {d16, d18, d20}, [r0], r12
++ vst3.16 {d17, d19, d21}, [r4], r12
++
++ bne 1b
++
++11:
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #24-48
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0], r12
++ vst3.16 {d1, d3, d5}, [r4]
++ beq 11b
++ vmov q0, q8
++ sub r5, #24
++ vmov q1, q9
++ vmov q2, q10
++1:
++ cmp r5, #12-48
++ blt 1f
++ vst3.16 {d0, d2, d4}, [r0]!
++ beq 11b
++ vmov d0, d1
++ sub r5, #12
++ vmov d2, d3
++ vmov d4, d5
++1:
++ cmp r5, #6-48
++ add r4, r0, #6 @ avoid [r0]! on sequential instructions
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]
++ vst3.16 {d0[1], d2[1], d4[1]}, [r4]
++ add r0, #12
++ beq 11b
++ vmov s0, s1
++ sub r5, #6
++ vmov s4, s5
++ vmov s8, s9
++1:
++ cmp r5, #3-48
++ blt 1f
++ vst3.16 {d0[0], d2[0], d4[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #16
++ vshr.u32 d2, #16
++1:
++ cmp r5, #2-48
++ blt 1f
++ vst2.16 {d0[0], d2[0]}, [r0]!
++ b 11b
++1:
++ vst1.16 {d0[0]}, [r0]!
++ b 11b
++
++endfunc
++
++
++@ void ff_rpi_sand30_lines_to_planar_y8(
++@ uint8_t * dest, // [r0]
++@ unsigned int dst_stride, // [r1]
++@ const uint8_t * src, // [r2]
++@ unsigned int src_stride1, // [r3] Ignored - assumed 128
++@ unsigned int src_stride2, // [sp, #0] -> r3
++@ unsigned int _x, // [sp, #4] Ignored - 0
++@ unsigned int y, // [sp, #8] (r7 in prefix)
++@ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++@ unsigned int h); // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++ push {r4-r8, lr} @ +24
++ ldr r3, [sp, #24]
++ ldr r6, [sp, #36]
++ ldr r7, [sp, #32] @ y
++ mov r12, #48
++ lsl r3, #7
++ sub r1, r1, r6
++ add r8, r2, r7, lsl #7
++ ldr r7, [sp, #40]
++
++10:
++ mov r2, r8
++ add r4, r0, #24
++ mov r5, r6
++1:
++ vldm r2, {q8-q15}
++
++ subs r5, #96
++
++ vmovn.u32 d0, q8
++ vshrn.u32 d2, q8, #12
++ vshrn.u32 d4, q8, #16 @ Cannot vshrn.u32 #20!
++
++ add r2, r3
++
++ vmovn.u32 d1, q9
++ vshrn.u32 d3, q9, #12
++ vshrn.u32 d5, q9, #16
++
++ pld [r2, #0]
++
++ vshrn.u16 d0, q0, #2
++ vmovn.u16 d1, q1
++ vshrn.u16 d2, q2, #6
++
++ vmovn.u32 d16, q10
++ vshrn.u32 d18, q10, #12
++ vshrn.u32 d20, q10, #16
++
++ vmovn.u32 d17, q11
++ vshrn.u32 d19, q11, #12
++ vshrn.u32 d21, q11, #16
++
++ pld [r2, #64]
++
++ vshrn.u16 d4, q8, #2
++ vmovn.u16 d5, q9
++ vshrn.u16 d6, q10, #6
++
++ vmovn.u32 d16, q12
++ vshrn.u32 d18, q12, #12
++ vshrn.u32 d20, q12, #16
++
++ vmovn.u32 d17, q13
++ vshrn.u32 d19, q13, #12
++ vshrn.u32 d21, q13, #16
++
++ vshrn.u16 d16, q8, #2
++ vmovn.u16 d17, q9
++ vshrn.u16 d18, q10, #6
++
++ vmovn.u32 d20, q14
++ vshrn.u32 d22, q14, #12
++ vshrn.u32 d24, q14, #16
++
++ vmovn.u32 d21, q15
++ vshrn.u32 d23, q15, #12
++ vshrn.u32 d25, q15, #16
++
++ vshrn.u16 d20, q10, #2
++ vmovn.u16 d21, q11
++ vshrn.u16 d22, q12, #6
++
++ blt 2f
++
++ vst3.8 {d0, d1, d2}, [r0], r12
++ vst3.8 {d4, d5, d6}, [r4], r12
++ vst3.8 {d16, d17, d18}, [r0], r12
++ vst3.8 {d20, d21, d22}, [r4], r12
++
++ bne 1b
++
++11:
++ subs r7, #1
++ add r0, r1
++ add r8, #128
++ bne 10b
++
++ pop {r4-r8, pc}
++
++@ Partial final write
++2:
++ cmp r5, #48-96
++ blt 1f
++ vst3.8 {d0, d1, d2}, [r0], r12
++ vst3.8 {d4, d5, d6}, [r4], r12
++ beq 11b
++ vmov q0, q8
++ vmov q2, q10
++ sub r5, #48
++ vmov d2, d18
++ vmov d6, d22
++1:
++ cmp r5, #24-96
++ blt 1f
++ vst3.8 {d0, d1, d2}, [r0]!
++ beq 11b
++ vmov q0, q2
++ sub r5, #24
++ vmov d2, d6
++1:
++ cmp r5, #12-96
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
++ vst3.8 {d0[2], d1[2], d2[2]}, [r0]!
++ vst3.8 {d0[3], d1[3], d2[3]}, [r0]!
++ beq 11b
++ vmov s0, s1
++ sub r5, #12
++ vmov s2, s3
++ vmov s4, s5
++1:
++ cmp r5, #6-96
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
++ vst3.8 {d0[1], d1[1], d2[1]}, [r0]!
++ add r0, #12
++ beq 11b
++ vshr.u32 d0, #16
++ sub r5, #6
++ vshr.u32 d1, #16
++ vshr.u32 d2, #16
++1:
++ cmp r5, #3-96
++ blt 1f
++ vst3.8 {d0[0], d1[0], d2[0]}, [r0]!
++ beq 11b
++ sub r5, #3
++ vshr.u32 d0, #8
++ vshr.u32 d1, #8
++1:
++ cmp r5, #2-96
++ blt 1f
++ vst2.8 {d0[0], d1[0]}, [r0]!
++ b 11b
++1:
++ vst1.8 {d0[0]}, [r0]!
++ b 11b
++
++endfunc
++
++
+diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h
+new file mode 100644
+index 0000000000..d457c10870
+--- /dev/null
++++ b/libavutil/arm/rpi_sand_neon.h
+@@ -0,0 +1,110 @@
++/*
++Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef AVUTIL_ARM_SAND_NEON_H
++#define AVUTIL_ARM_SAND_NEON_H
++
++void ff_rpi_sand128b_stripe_to_8_10(
++ uint8_t * dest, // [r0]
++ const uint8_t * src1, // [r1]
++ const uint8_t * src2, // [r2]
++ unsigned int lines); // [r3]
++
++void ff_rpi_sand8_lines_to_planar_y8(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand8_lines_to_planar_c8(
++ uint8_t * dst_u, // [r0]
++ unsigned int dst_stride_u, // [r1]
++ uint8_t * dst_v, // [r2]
++ unsigned int dst_stride_v, // [r3]
++ const uint8_t * src, // [sp, #0] -> r4, r5
++ unsigned int stride1, // [sp, #4] 128
++ unsigned int stride2, // [sp, #8] -> r8
++ unsigned int _x, // [sp, #12] 0
++ unsigned int y, // [sp, #16] (r7 in prefix)
++ unsigned int _w, // [sp, #20] -> r12, r6
++ unsigned int h); // [sp, #24] -> r7
++
++void ff_rpi_sand30_lines_to_planar_y16(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand30_lines_to_planar_c16(
++ uint8_t * dst_u, // [r0]
++ unsigned int dst_stride_u, // [r1]
++ uint8_t * dst_v, // [r2]
++ unsigned int dst_stride_v, // [r3]
++ const uint8_t * src, // [sp, #0] -> r4, r5
++ unsigned int stride1, // [sp, #4] 128
++ unsigned int stride2, // [sp, #8] -> r8
++ unsigned int _x, // [sp, #12] 0
++ unsigned int y, // [sp, #16] (r7 in prefix)
++ unsigned int _w, // [sp, #20] -> r6, r9
++ unsigned int h); // [sp, #24] -> r7
++
++void ff_rpi_sand30_lines_to_planar_p010(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++void ff_rpi_sand30_lines_to_planar_y8(
++ uint8_t * dest, // [r0]
++ unsigned int dst_stride, // [r1]
++ const uint8_t * src, // [r2]
++ unsigned int src_stride1, // [r3] Ignored - assumed 128
++ unsigned int src_stride2, // [sp, #0] -> r3
++ unsigned int _x, // [sp, #4] Ignored - 0
++ unsigned int y, // [sp, #8] (r7 in prefix)
++ unsigned int _w, // [sp, #12] -> r6 (cur r5)
++ unsigned int h); // [sp, #16] -> r7
++
++#endif // AVUTIL_ARM_SAND_NEON_H
++
+diff --git a/libavutil/frame.c b/libavutil/frame.c
+index 9545477acc..48621e4098 100644
+--- a/libavutil/frame.c
++++ b/libavutil/frame.c
+@@ -16,6 +16,8 @@
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
++#include "config.h"
++
+ #include "channel_layout.h"
+ #include "avassert.h"
+ #include "buffer.h"
+@@ -27,6 +29,9 @@
+ #include "mem.h"
+ #include "samplefmt.h"
+ #include "hwcontext.h"
++#if CONFIG_SAND
++#include "rpi_sand_fns.h"
++#endif
+
+ #if FF_API_OLD_CHANNEL_LAYOUT
+ #define CHECK_CHANNELS_CONSISTENCY(frame) \
+@@ -874,6 +879,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
+ (frame->crop_top + frame->crop_bottom) >= frame->height)
+ return AVERROR(ERANGE);
+
++#if CONFIG_SAND
++ // Sand cannot be cropped - do not try
++ if (av_rpi_is_sand_format(frame->format))
++ return 0;
++#endif
++
+ desc = av_pix_fmt_desc_get(frame->format);
+ if (!desc)
+ return AVERROR_BUG;
+diff --git a/libavutil/frame.h b/libavutil/frame.h
+index 2580269549..3a9d323325 100644
+--- a/libavutil/frame.h
++++ b/libavutil/frame.h
+@@ -957,6 +957,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags);
+ */
+ const char *av_frame_side_data_name(enum AVFrameSideDataType type);
+
++
++static inline int av_frame_cropped_width(const AVFrame * const frame)
++{
++ return frame->width - (frame->crop_left + frame->crop_right);
++}
++static inline int av_frame_cropped_height(const AVFrame * const frame)
++{
++ return frame->height - (frame->crop_top + frame->crop_bottom);
++}
++
+ /**
+ * @}
+ */
+diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c
+index 7a9fdbd263..137a952d2c 100644
+--- a/libavutil/hwcontext_drm.c
++++ b/libavutil/hwcontext_drm.c
+@@ -21,6 +21,7 @@
+ #include <fcntl.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
++#include <sys/ioctl.h>
+
+ /* This was introduced in version 4.6. And may not exist all without an
+ * optional package. So to prevent a hard dependency on needing the Linux
+@@ -31,6 +32,7 @@
+ #endif
+
+ #include <drm.h>
++#include <libdrm/drm_fourcc.h>
+ #include <xf86drm.h>
+
+ #include "avassert.h"
+@@ -38,7 +40,9 @@
+ #include "hwcontext_drm.h"
+ #include "hwcontext_internal.h"
+ #include "imgutils.h"
+-
++#if CONFIG_SAND
++#include "libavutil/rpi_sand_fns.h"
++#endif
+
+ static void drm_device_free(AVHWDeviceContext *hwdev)
+ {
+@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device,
+ AVDRMDeviceContext *hwctx = hwdev->hwctx;
+ drmVersionPtr version;
+
++ if (device == NULL) {
++ hwctx->fd = -1;
++ return 0;
++ }
++
+ hwctx->fd = open(device, O_RDWR);
+ if (hwctx->fd < 0)
+ return AVERROR(errno);
+@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
+ if (flags & AV_HWFRAME_MAP_WRITE)
+ mmap_prot |= PROT_WRITE;
+
++ if (dst->format == AV_PIX_FMT_NONE)
++ dst->format = hwfc->sw_format;
+ #if HAVE_LINUX_DMA_BUF_H
+ if (flags & AV_HWFRAME_MAP_READ)
+ map->sync_flags |= DMA_BUF_SYNC_READ;
+@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
+
+ dst->width = src->width;
+ dst->height = src->height;
++ dst->crop_top = src->crop_top;
++ dst->crop_bottom = src->crop_bottom;
++ dst->crop_left = src->crop_left;
++ dst->crop_right = src->crop_right;
++
++#if CONFIG_SAND
++ // Rework for sand frames
++ if (av_rpi_is_sand_frame(dst)) {
++ // As it stands the sand formats hold stride2 in linesize[3]
++ // linesize[0] & [1] contain stride1 which is always 128 for everything we do
++ // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
++ dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
++ dst->linesize[0] = 128;
++ dst->linesize[1] = 128;
++ // *** Are we sure src->height is actually what we want ???
++ }
++#endif
+
+ err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
+ &drm_unmap_frame, map);
+@@ -206,16 +234,29 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
+ enum AVHWFrameTransferDirection dir,
+ enum AVPixelFormat **formats)
+ {
+- enum AVPixelFormat *pix_fmts;
++ enum AVPixelFormat *p;
+
+- pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
+- if (!pix_fmts)
++ p = *formats = av_malloc_array(3, sizeof(*p));
++ if (!p)
+ return AVERROR(ENOMEM);
+
+- pix_fmts[0] = ctx->sw_format;
+- pix_fmts[1] = AV_PIX_FMT_NONE;
++ // **** Offer native sand too ????
++ *p++ =
++#if CONFIG_SAND
++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
++ AV_PIX_FMT_YUV420P :
++ ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
++ AV_PIX_FMT_YUV420P10LE :
++#endif
++ ctx->sw_format;
++
++#if CONFIG_SAND
++ if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
++ ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
++ *p++ = AV_PIX_FMT_NV12;
++#endif
+
+- *formats = pix_fmts;
++ *p = AV_PIX_FMT_NONE;
+ return 0;
+ }
+
+@@ -231,18 +272,62 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc,
+ map = av_frame_alloc();
+ if (!map)
+ return AVERROR(ENOMEM);
+- map->format = dst->format;
+
++ // Map to default
++ map->format = AV_PIX_FMT_NONE;
+ err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
+ if (err)
+ goto fail;
+
+- map->width = dst->width;
+- map->height = dst->height;
++#if 0
++ av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
++ hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
++ map->width, map->height,
++ map->linesize[0],
++ map->linesize[1],
++ map->linesize[2],
++ map->linesize[3],
++ dst->width, dst->height,
++ dst->linesize[0],
++ dst->linesize[1],
++ dst->linesize[2]);
++#endif
++#if CONFIG_SAND
++ if (av_rpi_is_sand_frame(map)) {
++ // Preserve crop - later ffmpeg code assumes that we have in that it
++ // overwrites any crop that we create with the old values
++ const unsigned int w = FFMIN(dst->width, map->width);
++ const unsigned int h = FFMIN(dst->height, map->height);
++
++ map->crop_top = 0;
++ map->crop_bottom = 0;
++ map->crop_left = 0;
++ map->crop_right = 0;
++
++ if (av_rpi_sand_to_planar_frame(dst, map) != 0)
++ {
++ av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
++ err = AVERROR(EINVAL);
++ goto fail;
++ }
++
++ dst->width = w;
++ dst->height = h;
++ }
++ else
++#endif
++ {
++ // Kludge mapped h/w s.t. frame_copy works
++ map->width = dst->width;
++ map->height = dst->height;
++ err = av_frame_copy(dst, map);
++ }
+
+- err = av_frame_copy(dst, map);
+ if (err)
++ {
++ av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
+ goto fail;
++ }
+
+ err = 0;
+ fail:
+@@ -257,7 +342,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc,
+ int err;
+
+ if (src->width > hwfc->width || src->height > hwfc->height)
++ {
++ av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height);
+ return AVERROR(EINVAL);
++ }
+
+ map = av_frame_alloc();
+ if (!map)
+diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
+index 2a9b5f4aac..11e7945f18 100644
+--- a/libavutil/hwcontext_vulkan.c
++++ b/libavutil/hwcontext_vulkan.c
+@@ -57,6 +57,14 @@
+ #define CHECK_CU(x) FF_CUDA_CHECK_DL(cuda_cu, cu, x)
+ #endif
+
++// Sometimes missing definitions
++#ifndef VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME
++#define VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME "VK_EXT_video_decode_h264"
++#endif
++#ifndef VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME
++#define VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME "VK_EXT_video_decode_h265"
++#endif
++
+ typedef struct VulkanQueueCtx {
+ VkFence fence;
+ VkQueue queue;
+diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
+index 62a2ae08d9..cb73521ea7 100644
+--- a/libavutil/pixdesc.c
++++ b/libavutil/pixdesc.c
+@@ -2717,6 +2717,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
+ .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_FLOAT |
+ AV_PIX_FMT_FLAG_ALPHA,
+ },
++ [AV_PIX_FMT_SAND128] = {
++ .name = "sand128",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 1, 0, 0, 8 }, /* Y */
++ { 1, 2, 0, 0, 8 }, /* U */
++ { 1, 2, 1, 0, 8 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_SAND64_10] = {
++ .name = "sand64_10",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 2, 0, 0, 10 }, /* Y */
++ { 1, 4, 0, 0, 10 }, /* U */
++ { 1, 4, 2, 0, 10 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_SAND64_16] = {
++ .name = "sand64_16",
++ .nb_components = 3,
++ .log2_chroma_w = 1,
++ .log2_chroma_h = 1,
++ .comp = {
++ { 0, 2, 0, 0, 16 }, /* Y */
++ { 1, 4, 0, 0, 16 }, /* U */
++ { 1, 4, 2, 0, 16 }, /* V */
++ },
++ .flags = 0,
++ },
++ [AV_PIX_FMT_RPI4_8] = {
++ .name = "rpi4_8",
++ .flags = AV_PIX_FMT_FLAG_HWACCEL,
++ },
++ [AV_PIX_FMT_RPI4_10] = {
++ .name = "rpi4_10",
++ .flags = AV_PIX_FMT_FLAG_HWACCEL,
++ },
+ };
+
+ static const char * const color_range_names[] = {
+diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
+index 37c2c79e01..5cc780e7d5 100644
+--- a/libavutil/pixfmt.h
++++ b/libavutil/pixfmt.h
+@@ -377,6 +377,14 @@ enum AVPixelFormat {
+
+ AV_PIX_FMT_Y210BE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
+ AV_PIX_FMT_Y210LE, ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
++// RPI - not on ifdef so can be got at by calling progs
++// #define so code that uses this can know it is there
++#define AVUTIL_HAVE_PIX_FMT_SAND 1
++ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
++ AV_PIX_FMT_RPI4_8,
++ AV_PIX_FMT_RPI4_10,
+
+ AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined
+ AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined
+diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
+new file mode 100644
+index 0000000000..0d5d203dc3
+--- /dev/null
++++ b/libavutil/rpi_sand_fn_pw.h
+@@ -0,0 +1,227 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++// * Included twice from rpi_sand_fn with different PW
++
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++// Fetches a single patch - offscreen fixup not done here
++// w <= stride1
++// unclipped
++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x;
++ const unsigned int w = _w;
++ const unsigned int mask = stride1 - 1;
++
++#if PW == 1 && HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
++ memcpy(dst, p, w);
++ }
++ }
++ else
++ {
++ // Two+ stripe
++ const unsigned int sstride = stride1 * stride2;
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ const uint8_t * p2 = p1 + sstride - (x & mask);
++ const unsigned int w1 = stride1 - (x & mask);
++ const unsigned int w3 = (x + w) & mask;
++ const unsigned int w2 = w - (w1 + w3);
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const uint8_t * p = p2;
++ uint8_t * d = dst;
++ memcpy(d, p1, w1);
++ d += w1;
++ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
++ memcpy(d, p, stride1);
++ }
++ memcpy(d, p, w3);
++ }
++ }
++}
++
++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
++
++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x * 2;
++ const unsigned int w = _w * 2;
++ const unsigned int mask = stride1 - 1;
++
++#if PW == 1 && HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
++ pixel * du = (pixel *)dst_u;
++ pixel * dv = (pixel *)dst_v;
++ const pixel * p = (const pixel *)p1;
++ for (unsigned int k = 0; k < w; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ }
++ else
++ {
++ // Two+ stripe
++ const unsigned int sstride = stride1 * stride2;
++ const unsigned int sstride_p = (sstride - stride1) / PW;
++
++ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ const uint8_t * p2 = p1 + sstride - (x & mask);
++ const unsigned int w1 = stride1 - (x & mask);
++ const unsigned int w3 = (x + w) & mask;
++ const unsigned int w2 = w - (w1 + w3);
++
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const pixel * p = (const pixel *)p1;
++ pixel * du = (pixel *)dst_u;
++ pixel * dv = (pixel *)dst_v;
++ for (unsigned int k = 0; k < w1; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ for (unsigned int k = 0; k < w3; k += 2 * PW) {
++ *du++ = *p++;
++ *dv++ = *p++;
++ }
++ }
++ }
++}
++
++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x = _x * 2;
++ const unsigned int w = _w * 2;
++ const unsigned int mask = stride1 - 1;
++ if ((x & ~mask) == ((x + w) & ~mask)) {
++ // All in one sand stripe
++ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
++ const pixel * su = (const pixel *)src_u;
++ const pixel * sv = (const pixel *)src_v;
++ pixel * p = (pixel *)p1;
++ for (unsigned int k = 0; k < w; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ }
++ else
++ {
++ // Two+ stripe
++ const unsigned int sstride = stride1 * stride2;
++ const unsigned int sstride_p = (sstride - stride1) / PW;
++
++ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++ const uint8_t * p2 = p1 + sstride - (x & mask);
++ const unsigned int w1 = stride1 - (x & mask);
++ const unsigned int w3 = (x + w) & mask;
++ const unsigned int w2 = w - (w1 + w3);
++
++ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
++ unsigned int j;
++ const pixel * su = (const pixel *)src_u;
++ const pixel * sv = (const pixel *)src_v;
++ pixel * p = (pixel *)p1;
++ for (unsigned int k = 0; k < w1; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ for (unsigned int k = 0; k < w3; k += 2 * PW) {
++ *p++ = *su++;
++ *p++ = *sv++;
++ }
++ }
++ }
++}
++
++
++#undef pixel
++#undef STRCAT
++#undef FUNC
++
+diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
+new file mode 100644
+index 0000000000..b6071e2928
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.c
+@@ -0,0 +1,445 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "config.h"
++#include <stdint.h>
++#include <string.h>
++#include "rpi_sand_fns.h"
++#include "avassert.h"
++#include "frame.h"
++
++#if ARCH_ARM && HAVE_NEON
++#include "arm/rpi_sand_neon.h"
++#define HAVE_SAND_ASM 1
++#elif ARCH_AARCH64 && HAVE_NEON
++#include "aarch64/rpi_sand_neon.h"
++#define HAVE_SAND_ASM 1
++#else
++#define HAVE_SAND_ASM 0
++#endif
++
++#define PW 1
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#define PW 2
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#if 1
++// Simple round
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++ const unsigned int rnd = (1 << shr) >> 1;
++ const uint16_t * src = (const uint16_t *)_src;
++
++ for (; n != 0; --n) {
++ *dst++ = (*src++ + rnd) >> shr;
++ }
++}
++#else
++// Dithered variation
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++ unsigned int rnd = (1 << shr) >> 1;
++ const unsigned int mask = ((1 << shr) - 1);
++ const uint16_t * src = (const uint16_t *)_src;
++
++ for (; n != 0; --n) {
++ rnd = *src++ + (rnd & mask);
++ *dst++ = rnd >> shr;
++ }
++}
++#endif
++
++// Fetches a single patch - offscreen fixup not done here
++// w <= stride1
++// unclipped
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++ const unsigned int xskip0 = _x - (x0 >> 2) * 3;
++ const unsigned int x1 = ((_x + _w) / 3) * 4;
++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
++ const unsigned int mask = stride1 - 1;
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // *******************
++ // Partial single word xfer
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint16_t * d = (uint16_t *)dst;
++
++ if (xskip0 != 0) {
++ const uint32_t p3 = *p++;
++
++ if (xskip0 == 1)
++ *d++ = (p3 >> 10) & 0x3ff;
++ *d++ = (p3 >> 20) & 0x3ff;
++
++ if (((x += 4) & mask) == 0)
++ p += slice_inc;
++ }
++
++ while (x != x1) {
++ const uint32_t p3 = *p++;
++ *d++ = p3 & 0x3ff;
++ *d++ = (p3 >> 10) & 0x3ff;
++ *d++ = (p3 >> 20) & 0x3ff;
++
++ if (((x += 4) & mask) == 0)
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) {
++ const uint32_t p3 = *p;
++
++ *d++ = p3 & 0x3ff;
++ if (xrem1 == 2)
++ *d++ = (p3 >> 10) & 0x3ff;
++ }
++ }
++}
++
++
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word
++ const unsigned int xskip0 = _x - (x0 >> 3) * 3;
++ const unsigned int x1 = ((_x + _w) / 3) * 8;
++ const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3;
++ const unsigned int mask = stride1 - 1;
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
++ src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // *******************
++ // Partial single word xfer
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint16_t * du = (uint16_t *)dst_u;
++ uint16_t * dv = (uint16_t *)dst_v;
++
++ if (xskip0 != 0) {
++ const uint32_t p3a = *p++;
++ const uint32_t p3b = *p++;
++
++ if (xskip0 == 1)
++ {
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = (p3b >> 0) & 0x3ff;
++ }
++ *du++ = (p3b >> 10) & 0x3ff;
++ *dv++ = (p3b >> 20) & 0x3ff;
++
++ if (((x += 8) & mask) == 0)
++ p += slice_inc;
++ }
++
++ while (x != x1) {
++ const uint32_t p3a = *p++;
++ const uint32_t p3b = *p++;
++
++ *du++ = p3a & 0x3ff;
++ *dv++ = (p3a >> 10) & 0x3ff;
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = p3b & 0x3ff;
++ *du++ = (p3b >> 10) & 0x3ff;
++ *dv++ = (p3b >> 20) & 0x3ff;
++
++ if (((x += 8) & mask) == 0)
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) {
++ const uint32_t p3a = *p++;
++ const uint32_t p3b = *p++;
++
++ *du++ = p3a & 0x3ff;
++ *dv++ = (p3a >> 10) & 0x3ff;
++ if (xrem1 == 2)
++ {
++ *du++ = (p3a >> 20) & 0x3ff;
++ *dv++ = p3b & 0x3ff;
++ }
++ }
++ }
++}
++
++// Fetches a single patch - offscreen fixup not done here
++// w <= stride1
++// single lose bottom 2 bits truncation
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h)
++{
++ const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++ const unsigned int xskip0 = _x - (x0 >> 2) * 3;
++ const unsigned int x1 = ((_x + _w) / 3) * 4;
++ const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
++ const unsigned int mask = stride1 - 1;
++ const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++ const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2; // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++ if (_x == 0) {
++ ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++ return;
++ }
++#endif
++
++ if (x0 == x1) {
++ // *******************
++ // Partial single word xfer
++ return;
++ }
++
++ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++ {
++ unsigned int x = x0;
++ const uint32_t * p = (const uint32_t *)p0;
++ uint8_t * d = dst;
++
++ if (xskip0 != 0) {
++ const uint32_t p3 = *p++;
++
++ if (xskip0 == 1)
++ *d++ = (p3 >> 12) & 0xff;
++ *d++ = (p3 >> 22) & 0xff;
++
++ if (((x += 4) & mask) == 0)
++ p += slice_inc;
++ }
++
++ while (x != x1) {
++ const uint32_t p3 = *p++;
++ *d++ = (p3 >> 2) & 0xff;
++ *d++ = (p3 >> 12) & 0xff;
++ *d++ = (p3 >> 22) & 0xff;
++
++ if (((x += 4) & mask) == 0)
++ p += slice_inc;
++ }
++
++ if (xrem1 != 0) {
++ const uint32_t p3 = *p;
++
++ *d++ = (p3 >> 2) & 0xff;
++ if (xrem1 == 2)
++ *d++ = (p3 >> 12) & 0xff;
++ }
++ }
++}
++
++
++
++// w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++ unsigned int w, unsigned int h, const unsigned int shr)
++{
++ const unsigned int n = dst_stride1 / 2;
++ unsigned int j;
++
++ // This is true for our current layouts
++ av_assert0(dst_stride1 == src_stride1);
++
++ // As we have the same stride1 for src & dest and src is wider than dest
++ // then if we loop on src we can always write contiguously to dest
++ // We make no effort to copy an exact width - round up to nearest src stripe
++ // as we will always have storage in dest for that
++
++#if ARCH_ARM && HAVE_NEON
++ if (shr == 3 && src_stride1 == 128) {
++ for (j = 0; j + n < w; j += dst_stride1) {
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2;
++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
++
++ ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
++ }
++ }
++ else
++#endif
++ {
++ for (j = 0; j + n < w; j += dst_stride1) {
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2;
++ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
++
++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
++ cpy16_to_8(d, s1, n, shr);
++ cpy16_to_8(d + n, s2, n, shr);
++ }
++ }
++ }
++
++ // Fix up a trailing dest half stripe
++ if (j < w) {
++ uint8_t * d = dst + j * dst_stride2;
++ const uint8_t * s1 = src + j * 2 * src_stride2;
++
++ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
++ cpy16_to_8(d, s1, n, shr);
++ }
++ }
++}
++
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
++{
++ const int w = av_frame_cropped_width(src);
++ const int h = av_frame_cropped_height(src);
++ const int x = src->crop_left;
++ const int y = src->crop_top;
++
++ // We will crop as part of the conversion
++ dst->crop_top = 0;
++ dst->crop_left = 0;
++ dst->crop_bottom = 0;
++ dst->crop_right = 0;
++
++ switch (src->format){
++ case AV_PIX_FMT_SAND128:
++ case AV_PIX_FMT_RPI4_8:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P:
++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w/2, h/2);
++ break;
++ case AV_PIX_FMT_NV12:
++ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w, h/2);
++ break;
++ default:
++ return -1;
++ }
++ break;
++ case AV_PIX_FMT_SAND64_10:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P10:
++ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x*2, y, w*2, h);
++ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y/2, w, h/2);
++ break;
++ default:
++ return -1;
++ }
++ break;
++ case AV_PIX_FMT_RPI4_10:
++ switch (dst->format){
++ case AV_PIX_FMT_YUV420P10:
++ av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
++ dst->data[2], dst->linesize[2],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w/2, h/2);
++ break;
++ case AV_PIX_FMT_NV12:
++ av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
++ src->data[0],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x, y, w, h);
++ av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1],
++ src->data[1],
++ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++ x/2, y/2, w, h/2);
++ break;
++ default:
++ return -1;
++ }
++ break;
++ default:
++ return -1;
++ }
++
++ return av_frame_copy_props(dst, src);
++}
+diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
+new file mode 100644
+index 0000000000..462ccb8abd
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.h
+@@ -0,0 +1,188 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++ * Redistributions of source code must retain the above copyright
++ notice, this list of conditions and the following disclaimer.
++ * Redistributions in binary form must reproduce the above copyright
++ notice, this list of conditions and the following disclaimer in the
++ documentation and/or other materials provided with the distribution.
++ * Neither the name of the copyright holder nor the
++ names of its contributors may be used to endorse or promote products
++ derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef AVUTIL_RPI_SAND_FNS
++#define AVUTIL_RPI_SAND_FNS
++
++#include "libavutil/frame.h"
++
++// For all these fns _x & _w are measured as coord * PW
++// For the C fns coords are in chroma pels (so luma / 2)
++// Strides are in bytes
++
++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
++ unsigned int stride1, unsigned int stride2,
++ const uint8_t * src_u, const unsigned int src_stride_u,
++ const uint8_t * src_v, const unsigned int src_stride_v,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++ uint8_t * dst_v, const unsigned int dst_stride_v,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++ const uint8_t * src,
++ unsigned int stride1, unsigned int stride2,
++ unsigned int _x, unsigned int y,
++ unsigned int _w, unsigned int h);
++
++// w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++ unsigned int w, unsigned int h, const unsigned int shr);
++
++
++// dst must contain required pixel format & allocated data buffers
++// Cropping on the src buffer will be honoured and dst crop will be set to zero
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
++
++
++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
++{
++#ifdef RPI_ZC_SAND128_ONLY
++ // If we are sure we only support 128 byte sand formats, replace the
++ // var with a constant which should allow for better optimisation
++ return 128;
++#else
++ return frame->linesize[0];
++#endif
++}
++
++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
++{
++ return frame->linesize[3];
++}
++
++
++static inline int av_rpi_is_sand_format(const int format)
++{
++ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10);
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++ return av_rpi_is_sand_format(frame->format);
++}
++
++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8);
++}
++
++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
++{
++ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
++{
++ return (frame->format == AV_PIX_FMT_RPI4_10);
++}
++
++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
++{
++ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
++}
++
++// If x is measured in bytes (not pixels) then this works for sand64_16 as
++// well as sand128 - but in the general case we work that out
++
++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
++ const unsigned int x1 = x & (stride1 - 1);
++ const unsigned int x2 = x ^ x1;
++
++ return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
++}
++
++#endif
++
+diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
+index a9bf6ff9e0..6a0e2dcc09 100644
+--- a/libswscale/aarch64/rgb2rgb.c
++++ b/libswscale/aarch64/rgb2rgb.c
+@@ -30,6 +30,12 @@
+ void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int width, int height,
+ int src1Stride, int src2Stride, int dstStride);
++void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv);
++void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv);
+
+ av_cold void rgb2rgb_init_aarch64(void)
+ {
+@@ -37,5 +43,7 @@ av_cold void rgb2rgb_init_aarch64(void)
+
+ if (have_neon(cpu_flags)) {
+ interleaveBytes = ff_interleave_bytes_neon;
++ ff_rgb24toyv12 = ff_rgb24toyv12_aarch64;
++ ff_bgr24toyv12 = ff_bgr24toyv12_aarch64;
+ }
+ }
+diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
+index d81110ec57..476ca723a0 100644
+--- a/libswscale/aarch64/rgb2rgb_neon.S
++++ b/libswscale/aarch64/rgb2rgb_neon.S
+@@ -77,3 +77,448 @@ function ff_interleave_bytes_neon, export=1
+ 0:
+ ret
+ endfunc
++
++// void ff_rgb24toyv12_aarch64(
++// const uint8_t *src, // x0
++// uint8_t *ydst, // x1
++// uint8_t *udst, // x2
++// uint8_t *vdst, // x3
++// int width, // w4
++// int height, // w5
++// int lumStride, // w6
++// int chromStride, // w7
++// int srcStr, // [sp, #0]
++// int32_t *rgb2yuv); // [sp, #8]
++
++function ff_rgb24toyv12_aarch64, export=1
++ ldr x15, [sp, #8]
++ ld1 {v3.s}[2], [x15], #4
++ ld1 {v3.s}[1], [x15], #4
++ ld1 {v3.s}[0], [x15], #4
++ ld1 {v4.s}[2], [x15], #4
++ ld1 {v4.s}[1], [x15], #4
++ ld1 {v4.s}[0], [x15], #4
++ ld1 {v5.s}[2], [x15], #4
++ ld1 {v5.s}[1], [x15], #4
++ ld1 {v5.s}[0], [x15]
++ b 99f
++endfunc
++
++// void ff_bgr24toyv12_aarch64(
++// const uint8_t *src, // x0
++// uint8_t *ydst, // x1
++// uint8_t *udst, // x2
++// uint8_t *vdst, // x3
++// int width, // w4
++// int height, // w5
++// int lumStride, // w6
++// int chromStride, // w7
++// int srcStr, // [sp, #0]
++// int32_t *rgb2yuv); // [sp, #8]
++
++// regs
++// v0-2 Src bytes - reused as chroma src
++// v3-5 Coeffs (packed very inefficiently - could be squashed)
++// v6 128b
++// v7 128h
++// v8-15 Reserved
++// v16-18 Lo Src expanded as H
++// v19 -
++// v20-22 Hi Src expanded as H
++// v23 -
++// v24 U out
++// v25 U tmp
++// v26 Y out
++// v27-29 Y tmp
++// v30 V out
++// v31 V tmp
++
++// Assumes Little Endian in tail stores & conversion matrix
++
++function ff_bgr24toyv12_aarch64, export=1
++ ldr x15, [sp, #8]
++ ld3 {v3.s, v4.s, v5.s}[0], [x15], #12
++ ld3 {v3.s, v4.s, v5.s}[1], [x15], #12
++ ld3 {v3.s, v4.s, v5.s}[2], [x15]
++99:
++ ldr w14, [sp, #0]
++ movi v7.8b, #128
++ uxtl v6.8h, v7.8b
++ // Ensure if nothing to do then we do nothing
++ cmp w4, #0
++ b.le 90f
++ cmp w5, #0
++ b.le 90f
++ // If w % 16 != 0 then -16 so we do main loop 1 fewer times with
++ // the remainder done in the tail
++ tst w4, #15
++ b.eq 1f
++ sub w4, w4, #16
++1:
++
++// -------------------- Even line body - YUV
++11:
++ subs w9, w4, #0
++ mov x10, x0
++ mov x11, x1
++ mov x12, x2
++ mov x13, x3
++ b.lt 12f
++
++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
++ subs w9, w9, #16
++ b.le 13f
++
++10:
++ uxtl v16.8h, v0.8b
++ uxtl v17.8h, v1.8b
++ uxtl v18.8h, v2.8b
++
++ uxtl2 v20.8h, v0.16b
++ uxtl2 v21.8h, v1.16b
++ uxtl2 v22.8h, v2.16b
++
++ bic v0.8h, #0xff, LSL #8
++ bic v1.8h, #0xff, LSL #8
++ bic v2.8h, #0xff, LSL #8
++
++ // Testing shows it is faster to stack the smull/smlal ops together
++ // rather than interleave them between channels and indeed even the
++ // shift/add sections seem happier not interleaved
++
++ // Y0
++ smull v26.4s, v16.4h, v3.h[0]
++ smlal v26.4s, v17.4h, v4.h[0]
++ smlal v26.4s, v18.4h, v5.h[0]
++ smull2 v27.4s, v16.8h, v3.h[0]
++ smlal2 v27.4s, v17.8h, v4.h[0]
++ smlal2 v27.4s, v18.8h, v5.h[0]
++ // Y1
++ smull v28.4s, v20.4h, v3.h[0]
++ smlal v28.4s, v21.4h, v4.h[0]
++ smlal v28.4s, v22.4h, v5.h[0]
++ smull2 v29.4s, v20.8h, v3.h[0]
++ smlal2 v29.4s, v21.8h, v4.h[0]
++ smlal2 v29.4s, v22.8h, v5.h[0]
++ shrn v26.4h, v26.4s, #12
++ shrn2 v26.8h, v27.4s, #12
++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
++ sqrshrun v26.8b, v26.8h, #3
++ shrn v28.4h, v28.4s, #12
++ shrn2 v28.8h, v29.4s, #12
++ add v28.8h, v28.8h, v6.8h
++ sqrshrun2 v26.16b, v28.8h, #3
++ // Y0/Y1
++
++ // U
++ // Vector subscript *2 as we loaded into S but are only using H
++ smull v24.4s, v0.4h, v3.h[2]
++ smlal v24.4s, v1.4h, v4.h[2]
++ smlal v24.4s, v2.4h, v5.h[2]
++ smull2 v25.4s, v0.8h, v3.h[2]
++ smlal2 v25.4s, v1.8h, v4.h[2]
++ smlal2 v25.4s, v2.8h, v5.h[2]
++
++ // V
++ smull v30.4s, v0.4h, v3.h[4]
++ smlal v30.4s, v1.4h, v4.h[4]
++ smlal v30.4s, v2.4h, v5.h[4]
++ smull2 v31.4s, v0.8h, v3.h[4]
++ smlal2 v31.4s, v1.8h, v4.h[4]
++ smlal2 v31.4s, v2.8h, v5.h[4]
++
++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++ shrn v24.4h, v24.4s, #14
++ shrn2 v24.8h, v25.4s, #14
++ sqrshrn v24.8b, v24.8h, #1
++ add v24.8b, v24.8b, v7.8b // +128
++ shrn v30.4h, v30.4s, #14
++ shrn2 v30.8h, v31.4s, #14
++ sqrshrn v30.8b, v30.8h, #1
++ add v30.8b, v30.8b, v7.8b // +128
++
++ subs w9, w9, #16
++
++ st1 {v26.16b}, [x11], #16
++ st1 {v24.8b}, [x12], #8
++ st1 {v30.8b}, [x13], #8
++
++ b.gt 10b
++
++// -------------------- Even line tail - YUV
++// If width % 16 == 0 then simply runs once with preloaded RGB
++// If other then deals with preload & then does remaining tail
++
++13:
++ // Body is simple copy of main loop body minus preload
++
++ uxtl v16.8h, v0.8b
++ uxtl v17.8h, v1.8b
++ uxtl v18.8h, v2.8b
++
++ uxtl2 v20.8h, v0.16b
++ uxtl2 v21.8h, v1.16b
++ uxtl2 v22.8h, v2.16b
++
++ bic v0.8h, #0xff, LSL #8
++ bic v1.8h, #0xff, LSL #8
++ bic v2.8h, #0xff, LSL #8
++
++ // Y0
++ smull v26.4s, v16.4h, v3.h[0]
++ smlal v26.4s, v17.4h, v4.h[0]
++ smlal v26.4s, v18.4h, v5.h[0]
++ smull2 v27.4s, v16.8h, v3.h[0]
++ smlal2 v27.4s, v17.8h, v4.h[0]
++ smlal2 v27.4s, v18.8h, v5.h[0]
++ // Y1
++ smull v28.4s, v20.4h, v3.h[0]
++ smlal v28.4s, v21.4h, v4.h[0]
++ smlal v28.4s, v22.4h, v5.h[0]
++ smull2 v29.4s, v20.8h, v3.h[0]
++ smlal2 v29.4s, v21.8h, v4.h[0]
++ smlal2 v29.4s, v22.8h, v5.h[0]
++ shrn v26.4h, v26.4s, #12
++ shrn2 v26.8h, v27.4s, #12
++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
++ sqrshrun v26.8b, v26.8h, #3
++ shrn v28.4h, v28.4s, #12
++ shrn2 v28.8h, v29.4s, #12
++ add v28.8h, v28.8h, v6.8h
++ sqrshrun2 v26.16b, v28.8h, #3
++ // Y0/Y1
++
++ // U
++ // Vector subscript *2 as we loaded into S but are only using H
++ smull v24.4s, v0.4h, v3.h[2]
++ smlal v24.4s, v1.4h, v4.h[2]
++ smlal v24.4s, v2.4h, v5.h[2]
++ smull2 v25.4s, v0.8h, v3.h[2]
++ smlal2 v25.4s, v1.8h, v4.h[2]
++ smlal2 v25.4s, v2.8h, v5.h[2]
++
++ // V
++ smull v30.4s, v0.4h, v3.h[4]
++ smlal v30.4s, v1.4h, v4.h[4]
++ smlal v30.4s, v2.4h, v5.h[4]
++ smull2 v31.4s, v0.8h, v3.h[4]
++ smlal2 v31.4s, v1.8h, v4.h[4]
++ smlal2 v31.4s, v2.8h, v5.h[4]
++
++ cmp w9, #-16
++
++ shrn v24.4h, v24.4s, #14
++ shrn2 v24.8h, v25.4s, #14
++ sqrshrn v24.8b, v24.8h, #1
++ add v24.8b, v24.8b, v7.8b // +128
++ shrn v30.4h, v30.4s, #14
++ shrn2 v30.8h, v31.4s, #14
++ sqrshrn v30.8b, v30.8h, #1
++ add v30.8b, v30.8b, v7.8b // +128
++
++ // Here:
++ // w9 == 0 width % 16 == 0, tail done
++ // w9 > -16 1st tail done (16 pels), remainder still to go
++ // w9 == -16 shouldn't happen
++ // w9 > -32 2nd tail done
++ // w9 <= -32 shouldn't happen
++
++ b.lt 2f
++ st1 {v26.16b}, [x11], #16
++ st1 {v24.8b}, [x12], #8
++ st1 {v30.8b}, [x13], #8
++ cbz w9, 3f
++
++12:
++ sub w9, w9, #16
++
++ tbz w9, #3, 1f
++ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24
++1: tbz w9, #2, 1f
++ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3
++ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3
++ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3
++ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3
++1: tbz w9, #1, 1f
++ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3
++ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3
++1: tbz w9, #0, 13b
++ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3
++ b 13b
++
++2:
++ tbz w9, #3, 1f
++ st1 {v26.8b}, [x11], #8
++ st1 {v24.s}[0], [x12], #4
++ st1 {v30.s}[0], [x13], #4
++1: tbz w9, #2, 1f
++ st1 {v26.s}[2], [x11], #4
++ st1 {v24.h}[2], [x12], #2
++ st1 {v30.h}[2], [x13], #2
++1: tbz w9, #1, 1f
++ st1 {v26.h}[6], [x11], #2
++ st1 {v24.b}[6], [x12], #1
++ st1 {v30.b}[6], [x13], #1
++1: tbz w9, #0, 1f
++ st1 {v26.b}[14], [x11]
++ st1 {v24.b}[7], [x12]
++ st1 {v30.b}[7], [x13]
++1:
++3:
++
++// -------------------- Odd line body - Y only
++
++ subs w5, w5, #1
++ b.eq 90f
++
++ subs w9, w4, #0
++ add x0, x0, w14, SXTX
++ add x1, x1, w6, SXTX
++ mov x10, x0
++ mov x11, x1
++ b.lt 12f
++
++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
++ subs w9, w9, #16
++ b.le 13f
++
++10:
++ uxtl v16.8h, v0.8b
++ uxtl v17.8h, v1.8b
++ uxtl v18.8h, v2.8b
++
++ uxtl2 v20.8h, v0.16b
++ uxtl2 v21.8h, v1.16b
++ uxtl2 v22.8h, v2.16b
++
++ // Testing shows it is faster to stack the smull/smlal ops together
++ // rather than interleave them between channels and indeed even the
++ // shift/add sections seem happier not interleaved
++
++ // Y0
++ smull v26.4s, v16.4h, v3.h[0]
++ smlal v26.4s, v17.4h, v4.h[0]
++ smlal v26.4s, v18.4h, v5.h[0]
++ smull2 v27.4s, v16.8h, v3.h[0]
++ smlal2 v27.4s, v17.8h, v4.h[0]
++ smlal2 v27.4s, v18.8h, v5.h[0]
++ // Y1
++ smull v28.4s, v20.4h, v3.h[0]
++ smlal v28.4s, v21.4h, v4.h[0]
++ smlal v28.4s, v22.4h, v5.h[0]
++ smull2 v29.4s, v20.8h, v3.h[0]
++ smlal2 v29.4s, v21.8h, v4.h[0]
++ smlal2 v29.4s, v22.8h, v5.h[0]
++
++ ld3 {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++ shrn v26.4h, v26.4s, #12
++ shrn2 v26.8h, v27.4s, #12
++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
++ sqrshrun v26.8b, v26.8h, #3
++ shrn v28.4h, v28.4s, #12
++ shrn2 v28.8h, v29.4s, #12
++ add v28.8h, v28.8h, v6.8h
++ sqrshrun2 v26.16b, v28.8h, #3
++ // Y0/Y1
++
++ subs w9, w9, #16
++
++ st1 {v26.16b}, [x11], #16
++
++ b.gt 10b
++
++// -------------------- Odd line tail - Y
++// If width % 16 == 0 then simply runs once with preloaded RGB
++// If other then deals with preload & then does remaining tail
++
++13:
++ // Body is simple copy of main loop body minus preload
++
++ uxtl v16.8h, v0.8b
++ uxtl v17.8h, v1.8b
++ uxtl v18.8h, v2.8b
++
++ uxtl2 v20.8h, v0.16b
++ uxtl2 v21.8h, v1.16b
++ uxtl2 v22.8h, v2.16b
++
++ // Y0
++ smull v26.4s, v16.4h, v3.h[0]
++ smlal v26.4s, v17.4h, v4.h[0]
++ smlal v26.4s, v18.4h, v5.h[0]
++ smull2 v27.4s, v16.8h, v3.h[0]
++ smlal2 v27.4s, v17.8h, v4.h[0]
++ smlal2 v27.4s, v18.8h, v5.h[0]
++ // Y1
++ smull v28.4s, v20.4h, v3.h[0]
++ smlal v28.4s, v21.4h, v4.h[0]
++ smlal v28.4s, v22.4h, v5.h[0]
++ smull2 v29.4s, v20.8h, v3.h[0]
++ smlal2 v29.4s, v21.8h, v4.h[0]
++ smlal2 v29.4s, v22.8h, v5.h[0]
++
++ cmp w9, #-16
++
++ shrn v26.4h, v26.4s, #12
++ shrn2 v26.8h, v27.4s, #12
++ add v26.8h, v26.8h, v6.8h // +128 (>> 3 = 16)
++ sqrshrun v26.8b, v26.8h, #3
++ shrn v28.4h, v28.4s, #12
++ shrn2 v28.8h, v29.4s, #12
++ add v28.8h, v28.8h, v6.8h
++ sqrshrun2 v26.16b, v28.8h, #3
++ // Y0/Y1
++
++ // Here:
++ // w9 == 0 width % 16 == 0, tail done
++ // w9 > -16 1st tail done (16 pels), remainder still to go
++ // w9 == -16 shouldn't happen
++ // w9 > -32 2nd tail done
++ // w9 <= -32 shouldn't happen
++
++ b.lt 2f
++ st1 {v26.16b}, [x11], #16
++ cbz w9, 3f
++
++12:
++ sub w9, w9, #16
++
++ tbz w9, #3, 1f
++ ld3 {v0.8b, v1.8b, v2.8b}, [x10], #24
++1: tbz w9, #2, 1f
++ ld3 {v0.b, v1.b, v2.b}[8], [x10], #3
++ ld3 {v0.b, v1.b, v2.b}[9], [x10], #3
++ ld3 {v0.b, v1.b, v2.b}[10], [x10], #3
++ ld3 {v0.b, v1.b, v2.b}[11], [x10], #3
++1: tbz w9, #1, 1f
++ ld3 {v0.b, v1.b, v2.b}[12], [x10], #3
++ ld3 {v0.b, v1.b, v2.b}[13], [x10], #3
++1: tbz w9, #0, 13b
++ ld3 {v0.b, v1.b, v2.b}[14], [x10], #3
++ b 13b
++
++2:
++ tbz w9, #3, 1f
++ st1 {v26.8b}, [x11], #8
++1: tbz w9, #2, 1f
++ st1 {v26.s}[2], [x11], #4
++1: tbz w9, #1, 1f
++ st1 {v26.h}[6], [x11], #2
++1: tbz w9, #0, 1f
++ st1 {v26.b}[14], [x11]
++1:
++3:
++
++// ------------------- Loop to start
++
++ add x0, x0, w14, SXTX
++ add x1, x1, w6, SXTX
++ add x2, x2, w7, SXTX
++ add x3, x3, w7, SXTX
++ subs w5, w5, #1
++ b.gt 11b
++90:
++ ret
++endfunc
+diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
+index e98fdac8ea..c3b9079d2b 100644
+--- a/libswscale/rgb2rgb.c
++++ b/libswscale/rgb2rgb.c
+@@ -83,6 +83,31 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
++void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
++ uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
++void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst,
++ uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
++void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst,
++ uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
++void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst,
++ uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
++void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst,
++ uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
+ void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+ int srcStride, int dstStride);
+ void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
+diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
+index f3951d523e..a0dd3ffb79 100644
+--- a/libswscale/rgb2rgb.h
++++ b/libswscale/rgb2rgb.h
+@@ -79,6 +79,9 @@ void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
+ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+ int chromStride, int srcStride, int32_t *rgb2yuv);
++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv);
+
+ /**
+ * Height should be a multiple of 2 and width should be a multiple of 16.
+@@ -128,6 +131,26 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ int width, int height,
+ int lumStride, int chromStride, int srcStride,
+ int32_t *rgb2yuv);
++extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
++extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
++extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
++extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
++extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++ int width, int height,
++ int lumStride, int chromStride, int srcStride,
++ int32_t *rgb2yuv);
+ extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+ int srcStride, int dstStride);
+
+diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
+index 42c69801ba..e711589e1e 100644
+--- a/libswscale/rgb2rgb_template.c
++++ b/libswscale/rgb2rgb_template.c
+@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
+ * others are ignored in the C version.
+ * FIXME: Write HQ version.
+ */
+-void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ uint8_t *vdst, int width, int height, int lumStride,
+- int chromStride, int srcStride, int32_t *rgb2yuv)
++ int chromStride, int srcStride, int32_t *rgb2yuv,
++ const uint8_t x[9])
+ {
+- int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+- int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+- int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
++ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
++ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
++ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
+ int y;
+ const int chromWidth = width >> 1;
+
+@@ -678,6 +679,19 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+ ydst[2 * i + 1] = Y;
+ }
++ if ((width & 1) != 0) {
++ unsigned int b = src[6 * i + 0];
++ unsigned int g = src[6 * i + 1];
++ unsigned int r = src[6 * i + 2];
++
++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++ udst[i] = U;
++ vdst[i] = V;
++ ydst[2 * i] = Y;
++ }
+ ydst += lumStride;
+ src += srcStride;
+
+@@ -700,6 +714,125 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+ ydst[2 * i + 1] = Y;
+ }
++ if ((width & 1) != 0) {
++ unsigned int b = src[6 * i + 0];
++ unsigned int g = src[6 * i + 1];
++ unsigned int r = src[6 * i + 2];
++
++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++ ydst[2 * i] = Y;
++ }
++ udst += chromStride;
++ vdst += chromStride;
++ ydst += lumStride;
++ src += srcStride;
++ }
++}
++
++static const uint8_t x_rgb[9] = {
++ RY_IDX, GY_IDX, BY_IDX,
++ RU_IDX, GU_IDX, BU_IDX,
++ RV_IDX, GV_IDX, BV_IDX,
++};
++
++static const uint8_t x_bgr[9] = {
++ BY_IDX, GY_IDX, RY_IDX,
++ BU_IDX, GU_IDX, RU_IDX,
++ BV_IDX, GV_IDX, RV_IDX,
++};
++
++void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++ rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv,
++ const uint8_t x[9])
++{
++ int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
++ int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
++ int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
++ int y;
++ const int chromWidth = width >> 1;
++
++ for (y = 0; y < height; y += 2) {
++ int i;
++ for (i = 0; i < chromWidth; i++) {
++ unsigned int b = src[8 * i + 2];
++ unsigned int g = src[8 * i + 1];
++ unsigned int r = src[8 * i + 0];
++
++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++ udst[i] = U;
++ vdst[i] = V;
++ ydst[2 * i] = Y;
++
++ b = src[8 * i + 6];
++ g = src[8 * i + 5];
++ r = src[8 * i + 4];
++
++ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++ ydst[2 * i + 1] = Y;
++ }
++ if ((width & 1) != 0) {
++ unsigned int b = src[8 * i + 2];
++ unsigned int g = src[8 * i + 1];
++ unsigned int r = src[8 * i + 0];
++
++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++ unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++ unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++ udst[i] = U;
++ vdst[i] = V;
++ ydst[2 * i] = Y;
++ }
++ ydst += lumStride;
++ src += srcStride;
++
++ if (y+1 == height)
++ break;
++
++ for (i = 0; i < chromWidth; i++) {
++ unsigned int b = src[8 * i + 2];
++ unsigned int g = src[8 * i + 1];
++ unsigned int r = src[8 * i + 0];
++
++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++ ydst[2 * i] = Y;
++
++ b = src[8 * i + 6];
++ g = src[8 * i + 5];
++ r = src[8 * i + 4];
++
++ Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++ ydst[2 * i + 1] = Y;
++ }
++ if ((width & 1) != 0) {
++ unsigned int b = src[8 * i + 2];
++ unsigned int g = src[8 * i + 1];
++ unsigned int r = src[8 * i + 0];
++
++ unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++ ydst[2 * i] = Y;
++ }
+ udst += chromStride;
+ vdst += chromStride;
+ ydst += lumStride;
+@@ -707,6 +840,37 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+ }
+ }
+
++static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++ rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++// As the general code does no SIMD-like ops simply adding 1 to the src address
++// will fix the ignored alpha position
++static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++ uint8_t *vdst, int width, int height, int lumStride,
++ int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++ rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++
+ static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dest, int width, int height,
+ int src1Stride, int src2Stride, int dstStride)
+@@ -980,6 +1144,11 @@ static av_cold void rgb2rgb_init_c(void)
+ yuy2toyv12 = yuy2toyv12_c;
+ planar2x = planar2x_c;
+ ff_rgb24toyv12 = ff_rgb24toyv12_c;
++ ff_bgr24toyv12 = ff_bgr24toyv12_c;
++ ff_rgbxtoyv12 = ff_rgbxtoyv12_c;
++ ff_bgrxtoyv12 = ff_bgrxtoyv12_c;
++ ff_xrgbtoyv12 = ff_xrgbtoyv12_c;
++ ff_xbgrtoyv12 = ff_xbgrtoyv12_c;
+ interleaveBytes = interleaveBytes_c;
+ deinterleaveBytes = deinterleaveBytes_c;
+ vu9_to_vu12 = vu9_to_vu12_c;
+diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
+index 9af2e7ecc3..52469b2e4a 100644
+--- a/libswscale/swscale_unscaled.c
++++ b/libswscale/swscale_unscaled.c
+@@ -1654,6 +1654,91 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ return srcSliceH;
+ }
+
++static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++ int srcStride[], int srcSliceY, int srcSliceH,
++ uint8_t *dst[], int dstStride[])
++{
++ ff_bgr24toyv12(
++ src[0],
++ dst[0] + srcSliceY * dstStride[0],
++ dst[1] + (srcSliceY >> 1) * dstStride[1],
++ dst[2] + (srcSliceY >> 1) * dstStride[2],
++ c->srcW, srcSliceH,
++ dstStride[0], dstStride[1], srcStride[0],
++ c->input_rgb2yuv_table);
++ if (dst[3])
++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++ return srcSliceH;
++}
++
++static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++ int srcStride[], int srcSliceY, int srcSliceH,
++ uint8_t *dst[], int dstStride[])
++{
++ ff_bgrxtoyv12(
++ src[0],
++ dst[0] + srcSliceY * dstStride[0],
++ dst[1] + (srcSliceY >> 1) * dstStride[1],
++ dst[2] + (srcSliceY >> 1) * dstStride[2],
++ c->srcW, srcSliceH,
++ dstStride[0], dstStride[1], srcStride[0],
++ c->input_rgb2yuv_table);
++ if (dst[3])
++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++ return srcSliceH;
++}
++
++static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++ int srcStride[], int srcSliceY, int srcSliceH,
++ uint8_t *dst[], int dstStride[])
++{
++ ff_rgbxtoyv12(
++ src[0],
++ dst[0] + srcSliceY * dstStride[0],
++ dst[1] + (srcSliceY >> 1) * dstStride[1],
++ dst[2] + (srcSliceY >> 1) * dstStride[2],
++ c->srcW, srcSliceH,
++ dstStride[0], dstStride[1], srcStride[0],
++ c->input_rgb2yuv_table);
++ if (dst[3])
++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++ return srcSliceH;
++}
++
++static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++ int srcStride[], int srcSliceY, int srcSliceH,
++ uint8_t *dst[], int dstStride[])
++{
++ ff_xbgrtoyv12(
++ src[0],
++ dst[0] + srcSliceY * dstStride[0],
++ dst[1] + (srcSliceY >> 1) * dstStride[1],
++ dst[2] + (srcSliceY >> 1) * dstStride[2],
++ c->srcW, srcSliceH,
++ dstStride[0], dstStride[1], srcStride[0],
++ c->input_rgb2yuv_table);
++ if (dst[3])
++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++ return srcSliceH;
++}
++
++static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++ int srcStride[], int srcSliceY, int srcSliceH,
++ uint8_t *dst[], int dstStride[])
++{
++ ff_xrgbtoyv12(
++ src[0],
++ dst[0] + srcSliceY * dstStride[0],
++ dst[1] + (srcSliceY >> 1) * dstStride[1],
++ dst[2] + (srcSliceY >> 1) * dstStride[2],
++ c->srcW, srcSliceH,
++ dstStride[0], dstStride[1], srcStride[0],
++ c->input_rgb2yuv_table);
++ if (dst[3])
++ fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++ return srcSliceH;
++}
++
+ static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+ int srcStride[], int srcSliceY, int srcSliceH,
+ uint8_t *dst[], int dstStride[])
+@@ -1977,7 +2062,6 @@ void ff_get_unscaled_swscale(SwsContext *c)
+ const enum AVPixelFormat dstFormat = c->dstFormat;
+ const int flags = c->flags;
+ const int dstH = c->dstH;
+- const int dstW = c->dstW;
+ int needsDither;
+
+ needsDither = isAnyRGB(dstFormat) &&
+@@ -2035,8 +2119,34 @@ void ff_get_unscaled_swscale(SwsContext *c)
+ /* bgr24toYV12 */
+ if (srcFormat == AV_PIX_FMT_BGR24 &&
+ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+- !(flags & SWS_ACCURATE_RND) && !(dstW&1))
++ !(flags & SWS_ACCURATE_RND))
+ c->convert_unscaled = bgr24ToYv12Wrapper;
++ /* rgb24toYV12 */
++ if (srcFormat == AV_PIX_FMT_RGB24 &&
++ (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
++ !(flags & SWS_ACCURATE_RND))
++ c->convert_unscaled = rgb24ToYv12Wrapper;
++
++ /* bgrxtoYV12 */
++ if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) ||
++ (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++ !(flags & SWS_ACCURATE_RND))
++ c->convert_unscaled = bgrxToYv12Wrapper;
++ /* rgbx24toYV12 */
++ if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
++ (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++ !(flags & SWS_ACCURATE_RND))
++ c->convert_unscaled = rgbxToYv12Wrapper;
++ /* xbgrtoYV12 */
++ if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
++ (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++ !(flags & SWS_ACCURATE_RND))
++ c->convert_unscaled = xbgrToYv12Wrapper;
++ /* xrgb24toYV12 */
++ if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
++ (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++ !(flags & SWS_ACCURATE_RND))
++ c->convert_unscaled = xrgbToYv12Wrapper;
+
+ /* RGB/BGR -> RGB/BGR (no dither needed forms) */
+ if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
+diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
+index 6c38041ddb..12776ffec7 100644
+--- a/libswscale/tests/swscale.c
++++ b/libswscale/tests/swscale.c
+@@ -23,6 +23,7 @@
+ #include <string.h>
+ #include <inttypes.h>
+ #include <stdarg.h>
++#include <time.h>
+
+ #undef HAVE_AV_CONFIG_H
+ #include "libavutil/cpu.h"
+@@ -78,6 +79,15 @@ struct Results {
+ uint32_t crc;
+ };
+
++static int time_rep = 0;
++
++static uint64_t utime(void)
++{
++ struct timespec ts;
++ clock_gettime(CLOCK_MONOTONIC, &ts);
++ return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000;
++}
++
+ // test by ref -> src -> dst -> out & compare out against ref
+ // ref & out are YV12
+ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+ goto end;
+ }
+
+- printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
++ printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d",
+ desc_src->name, srcW, srcH,
+ desc_dst->name, dstW, dstH,
+ flags);
+@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+
+ sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
+
++ if (time_rep != 0)
++ {
++ const uint64_t now = utime();
++ uint64_t done;
++ for (i = 1; i != time_rep; ++i) {
++ sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
++ }
++ done = utime();
++ printf(" T=%7"PRId64"us ", done-now);
++ }
++
+ for (i = 0; i < 4 && dstStride[i]; i++)
+ crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
+ dstStride[i] * dstH);
+@@ -355,56 +376,78 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4],
+ return 0;
+ }
+
+-#define W 96
+-#define H 96
+-
+ int main(int argc, char **argv)
+ {
++ unsigned int W = 96;
++ unsigned int H = 96;
++ unsigned int W2;
++ unsigned int H2;
++ unsigned int S;
+ enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
+ enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
+- uint8_t *rgb_data = av_malloc(W * H * 4);
+- const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
+- int rgb_stride[4] = { 4 * W, 0, 0, 0 };
+- uint8_t *data = av_malloc(4 * W * H);
+- const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
+- int stride[4] = { W, W, W, W };
+ int x, y;
+ struct SwsContext *sws;
+ AVLFG rand;
+ int res = -1;
+ int i;
+ FILE *fp = NULL;
+-
+- if (!rgb_data || !data)
+- return -1;
++ uint8_t *rgb_data;
++ uint8_t * rgb_src[4] = { NULL };
++ int rgb_stride[4] = { 0 };
++ uint8_t *data;
++ uint8_t * src[4] = { NULL };
++ int stride[4] = { 0 };
+
+ for (i = 1; i < argc; i += 2) {
++ const char * const arg2 = argv[i+1];
++
+ if (argv[i][0] != '-' || i + 1 == argc)
+ goto bad_option;
+ if (!strcmp(argv[i], "-ref")) {
+- fp = fopen(argv[i + 1], "r");
++ fp = fopen(arg2, "r");
+ if (!fp) {
+- fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
++ fprintf(stderr, "could not open '%s'\n", arg2);
+ goto error;
+ }
+ } else if (!strcmp(argv[i], "-cpuflags")) {
+ unsigned flags = av_get_cpu_flags();
+- int ret = av_parse_cpu_caps(&flags, argv[i + 1]);
++ int ret = av_parse_cpu_caps(&flags, arg2);
+ if (ret < 0) {
+- fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]);
++ fprintf(stderr, "invalid cpu flags %s\n", arg2);
+ return ret;
+ }
+ av_force_cpu_flags(flags);
+ } else if (!strcmp(argv[i], "-src")) {
+- srcFormat = av_get_pix_fmt(argv[i + 1]);
++ srcFormat = av_get_pix_fmt(arg2);
+ if (srcFormat == AV_PIX_FMT_NONE) {
+- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
++ fprintf(stderr, "invalid pixel format %s\n", arg2);
+ return -1;
+ }
+ } else if (!strcmp(argv[i], "-dst")) {
+- dstFormat = av_get_pix_fmt(argv[i + 1]);
++ dstFormat = av_get_pix_fmt(arg2);
+ if (dstFormat == AV_PIX_FMT_NONE) {
+- fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
++ fprintf(stderr, "invalid pixel format %s\n", arg2);
++ return -1;
++ }
++ } else if (!strcmp(argv[i], "-w")) {
++ char * p = NULL;
++ W = strtoul(arg2, &p, 0);
++ if (!W || *p) {
++ fprintf(stderr, "bad width %s\n", arg2);
++ return -1;
++ }
++ } else if (!strcmp(argv[i], "-h")) {
++ char * p = NULL;
++ H = strtoul(arg2, &p, 0);
++ if (!H || *p) {
++ fprintf(stderr, "bad height '%s'\n", arg2);
++ return -1;
++ }
++ } else if (!strcmp(argv[i], "-t")) {
++ char * p = NULL;
++ time_rep = (int)strtol(arg2, &p, 0);
++ if (*p) {
++ fprintf(stderr, "bad time repetitions '%s'\n", arg2);
+ return -1;
+ }
+ } else {
+@@ -414,15 +457,34 @@ bad_option:
+ }
+ }
+
+- sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
++ S = (W + 15) & ~15;
++ rgb_data = av_mallocz(S * H * 4);
++ rgb_src[0] = rgb_data;
++ rgb_stride[0] = 4 * S;
++ data = av_mallocz(4 * S * H);
++ src[0] = data;
++ src[1] = data + S * H;
++ src[2] = data + S * H * 2;
++ src[3] = data + S * H * 3;
++ stride[0] = S;
++ stride[1] = S;
++ stride[2] = S;
++ stride[3] = S;
++ H2 = H < 96 ? 8 : H / 12;
++ W2 = W < 96 ? 8 : W / 12;
++
++ if (!rgb_data || !data)
++ return -1;
++
++ sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H,
+ AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
+
+ av_lfg_init(&rand, 1);
+
+ for (y = 0; y < H; y++)
+ for (x = 0; x < W * 4; x++)
+- rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
+- res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride);
++ rgb_data[ x + y * 4 * S] = av_lfg_get(&rand);
++ res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride);
+ if (res < 0 || res != H) {
+ res = -1;
+ goto error;
+@@ -431,10 +493,10 @@ bad_option:
+ av_free(rgb_data);
+
+ if(fp) {
+- res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
++ res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat);
+ fclose(fp);
+ } else {
+- selfTest(src, stride, W, H, srcFormat, dstFormat);
++ selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat);
+ res = 0;
+ }
+ error:
+diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
+new file mode 100644
+index 0000000000..2b62d660c0
+--- /dev/null
++++ b/pi-util/BUILD.txt
+@@ -0,0 +1,67 @@
++Building Pi FFmpeg
++==================
++
++Currently only building on a Pi is supported.
++This builds ffmpeg the way I've tested it
++
++Get all dependencies - the current package dependencies are good enough
++
++$ sudo apt-get build-dep ffmpeg
++
++Configure using the pi-util/conf_native.sh script
++-------------------------------------------------
++
++This sets the normal release options and creates an output dir to build into
++The directory name will depend on system and options but will be under out/
++
++There are a few choices here
++ --mmal build including the legacy mmal-based decoders and zero-copy code
++ this requires appropriate libraries which currently will exist for
++ armv7 but not arm64
++ --noshared
++ Build a static image rather than a shared library one. Static is
++ easier for testing as there is no need to worry about library
++ paths being confused and therefore running the wrong code, Shared
++ is what is needed, in most cases, when building for use by other
++ programs.
++ --usr Set install dir to /usr (i.e. system default) rather than in
++ <builddir>/install
++
++So for a static build
++---------------------
++
++$ pi-util/conf_native.sh --noshared
++
++$ make -j8 -C out/<wherever the script said it was building to>
++
++You can now run ffmpeg directly from where it was built
++
++For a shared build
++------------------
++
++There are two choices here
++
++$ pi-util/conf_native.sh
++$ make -j8 -C out/<builddir> install
++
++This sets the install prefix to <builddir>/install and is probably what you
++want if you don't want to overwrite the system files.
++
++You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
++built. You can copy the contents of <build dir>/install to /usr and that mostly
++works. The only downside is that paths in pkgconfig end up being set to the
++install directory in your build directory which may be less than ideal when
++building other packages.
++
++The alternative if you just want to replace the system libs is:
++
++$ pi-util/conf_native.sh --usr
++$ make -j8 -C out/<builddir>
++$ sudo pi-util/clean_usr_libs.sh
++$ sudo make -j8 -C out/<builddir> install
++
++The clean_usr_libs.sh step wipes any existing libs & includes (for all
++architectures) from the system which helps avoid confusion when running other
++progs as you can be sure you're not running old code which is unfortunately
++easy to do otherwise.
++
+diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt
+new file mode 100644
+index 0000000000..fcce72226a
+--- /dev/null
++++ b/pi-util/NOTES.txt
+@@ -0,0 +1,69 @@
++Notes on the hevc_rpi decoder & associated support code
++-------------------------------------------------------
++
++There are 3 main parts to the existing code:
++
++1) The decoder - this is all in libavcodec as rpi_hevc*.
++
++2) A few filters to deal with Sand frames and a small patch to
++automatically select the sand->i420 converter when required.
++
++3) A kludge in ffmpeg.c to display the decoded video. This could & should
++be converted into a proper ffmpeg display module.
++
++
++Decoder
++-------
++
++The decoder is a modified version of the existing ffmpeg hevc decoder.
++Generally it is ~100% faster than the existing ffmpeg hevc s/w decoder.
++More complex bitstreams can be up to ~200% faster but particularly easy
++streams can cut its advantage down to ~50%. This means that a Pi3+ can
++display nearly all 8-bit 1080p30 streams and with some overclocking it can
++display most lower bitrate 10-bit 1080p30 streams - this latter case is
++not helped by the requirement to downsample to 8-bit before display on a
++Pi.
++
++It has had co-processor offload added for inter-pred and large block
++residual transform. Various parts have had optimized ARM NEON assembler
++added and the existing ARM asm sections have been profiled and
++re-optimized for A53. The main C code has been substantially reworked at
++its lower levels in an attempt to optimize it and minimize memory
++bandwidth. To some extent code paths that deal with frame types that it
++doesn't support have been pruned.
++
++It outputs frames in Broadcom Sand format. This is a somewhat annoying
++layout that doesn't fit into ffmpegs standard frame descriptions. It has
++vertical stripes of 128 horizontal pixels (64 in 10 bit forms) with Y for
++the stripe followed by interleaved U & V, that is then followed by the Y
++for the next stripe, etc. The final stripe is always padded to
++stripe-width. This is used in an attempt to help with cache locality and
++cut down on the number of dram bank switches. It is annoying to use for
++inter-pred with conventional processing but the way the Pi QPU (which is
++used for inter-pred) works means that it has negligible downsides here and
++the improved memory performance exceeds the overhead of the increased
++complexity in the rest of the code.
++
++Frames must be allocated out of GPU memory (as otherwise they can't be
++accessed by the co-processors). Utility functions (in rpi_zc.c) have been
++written to make this easier. As the frames are already in GPU memory they
++can be displayed by the Pi h/w without any further copying.
++
++
++Known non-features
++------------------
++
++Frame allocation should probably be done in some other way in order to fit
++into the standard framework better.
++
++Sand frames are currently declared as software frames, there is an
++argument that they should be hardware frames but they aren't really.
++
++There must be a better way of auto-selecting the hevc_rpi decoder over the
++normal s/w hevc decoder, but I became confused by the existing h/w
++acceleration framework and what I wanted to do didn't seem to fit in
++neatly.
++
++Display should be a proper device rather than a kludge in ffmpeg.c
++
++
+diff --git a/pi-util/TESTMESA.txt b/pi-util/TESTMESA.txt
+new file mode 100644
+index 0000000000..92bc13a3df
+--- /dev/null
++++ b/pi-util/TESTMESA.txt
+@@ -0,0 +1,82 @@
++# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
++
++# These assume that the drm_mmal test for Sand8 has been built on this Pi
++# as build relies on many of the same files
++
++# 1st get everything required to build ffmpeg
++# If sources aren't already enabled on your Pi then enable them
++sudo su
++sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
++sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
++mv /tmp/sources.list /etc/apt/
++mv /tmp/raspi.list /etc/apt/sources.list.d/
++apt update
++
++# Get dependencies
++sudo apt build-dep ffmpeg
++
++sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
++
++# Enable H265 V4L2 request decoder
++sudo su
++echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
++# You may also want to add more CMA if you are going to try 4k videos
++# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
++# dtoverlay=vc4-fkms-v3d,cma-512
++reboot
++# Check it has turned up
++ls -la /dev/video*
++# This should include video19
++# crw-rw----+ 1 root video 81, 7 Aug 4 17:25 /dev/video19
++
++# Currently on the Pi the linux headers from the debian distro don't match
++# the kernel that we ship and we need to update them - hopefully this step
++# will be unneeded in the future
++sudo apt install git bc bison flex libssl-dev make
++git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
++cd linux
++KERNEL=kernel7l
++make bcm2711_defconfig
++make headers_install
++sudo cp -r usr/include/linux /usr/include
++cd ..
++
++# Config - this builds a statically linked ffmpeg which is easier for testing
++pi-util/conf_native.sh --noshared
++
++# Build (this is a bit dull)
++# If you want to poke the source the libavdevice/egl_vout.c contains the
++# output code -
++cd out/armv7-static-rel
++
++# Check that you have actually configured V4L2 request
++grep HEVC_V4L2REQUEST config.h
++# You are hoping for
++# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
++# if you get 0 then the config has failed
++
++make -j6
++
++# Grab test streams
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
++
++# Test i420 output (works currently)
++./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
++
++# Test Sand8 output - doesn't currently work but should once you have
++# Sand8 working in drm_mmal. I can't guarantee that this will work as
++# I can't test this path with a known working format, but the debug looks
++# good. If this doesn't work & drm_mmal does with sand8 then come back to me
++# The "show_all 1" forces vout to display every frame otherwise it drops any
++# frame that would cause it to block
++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
++
++# Test Sand30 - doesn't currently work
++# (Beware that when FFmpeg errors out it often leaves your terminal window
++# in a state where you need to reset it)
++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
++
++
++
+diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh
+new file mode 100755
+index 0000000000..01bd6a6a22
+--- /dev/null
++++ b/pi-util/clean_usr_libs.sh
+@@ -0,0 +1,42 @@
++set -e
++U=/usr/include/arm-linux-gnueabihf
++rm -rf $U/libavcodec
++rm -rf $U/libavdevice
++rm -rf $U/libavfilter
++rm -rf $U/libavformat
++rm -rf $U/libavutil
++rm -rf $U/libswresample
++rm -rf $U/libswscale
++U=/usr/include/aarch64-linux-gnu
++rm -rf $U/libavcodec
++rm -rf $U/libavdevice
++rm -rf $U/libavfilter
++rm -rf $U/libavformat
++rm -rf $U/libavutil
++rm -rf $U/libswresample
++rm -rf $U/libswscale
++U=/usr/lib/arm-linux-gnueabihf
++rm -f $U/libavcodec.*
++rm -f $U/libavdevice.*
++rm -f $U/libavfilter.*
++rm -f $U/libavformat.*
++rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
++U=/usr/lib/arm-linux-gnueabihf/neon/vfp
++rm -f $U/libavcodec.*
++rm -f $U/libavdevice.*
++rm -f $U/libavfilter.*
++rm -f $U/libavformat.*
++rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
++U=/usr/lib/aarch64-linux-gnu
++rm -f $U/libavcodec.*
++rm -f $U/libavdevice.*
++rm -f $U/libavfilter.*
++rm -f $U/libavformat.*
++rm -f $U/libavutil.*
++rm -f $U/libswresample.*
++rm -f $U/libswscale.*
++
+diff --git a/pi-util/conf_arm64_native.sh b/pi-util/conf_arm64_native.sh
+new file mode 100644
+index 0000000000..9e3bbfa190
+--- /dev/null
++++ b/pi-util/conf_arm64_native.sh
+@@ -0,0 +1,45 @@
++echo "Configure for ARM64 native build"
++
++#RPI_KEEPS="-save-temps=obj"
++
++SHARED_LIBS="--enable-shared"
++if [ "$1" == "--noshared" ]; then
++ SHARED_LIBS="--disable-shared"
++ echo Static libs
++ OUT=out/arm64-static-rel
++else
++ echo Shared libs
++ OUT=out/arm64-shared-rel
++fi
++
++mkdir -p $OUT
++cd $OUT
++
++A=aarch64-linux-gnu
++USR_PREFIX=`pwd`/install
++LIB_PREFIX=$USR_PREFIX/lib/$A
++INC_PREFIX=$USR_PREFIX/include/$A
++
++../../configure \
++ --prefix=$USR_PREFIX\
++ --libdir=$LIB_PREFIX\
++ --incdir=$INC_PREFIX\
++ --disable-stripping\
++ --disable-thumb\
++ --disable-mmal\
++ --enable-sand\
++ --enable-v4l2-request\
++ --enable-libdrm\
++ --enable-epoxy\
++ --enable-libudev\
++ --enable-vout-drm\
++ --enable-vout-egl\
++ $SHARED_LIBS\
++ --extra-cflags="-ggdb"
++
++# --enable-decoder=hevc_rpi\
++# --enable-extra-warnings\
++# --arch=armv71\
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv
+new file mode 100644
+index 0000000000..4efd5d1c67
+--- /dev/null
++++ b/pi-util/conf_h265.2016.csv
+@@ -0,0 +1,195 @@
++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
++1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
++0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
++1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
++1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
++1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
+diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
+new file mode 100644
+index 0000000000..6082641271
+--- /dev/null
++++ b/pi-util/conf_h265.2016_HEVC_v1.csv
+@@ -0,0 +1,147 @@
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
+new file mode 100644
+index 0000000000..fc14f2a3c2
+--- /dev/null
++++ b/pi-util/conf_h265.csv
+@@ -0,0 +1,144 @@
++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
+diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
+new file mode 100755
+index 0000000000..1dbbcf154a
+--- /dev/null
++++ b/pi-util/conf_native.sh
+@@ -0,0 +1,157 @@
++echo "Configure for native build"
++
++FFSRC=`pwd`
++MC=`dpkg --print-architecture`
++BUILDBASE=$FFSRC/out
++
++#RPI_KEEPS="-save-temps=obj"
++RPI_KEEPS=""
++
++NOSHARED=
++MMAL=
++USR_PREFIX=
++DO_MAKE=
++DO_INSTALL=
++INSTALL_SUDO=
++
++while [ "$1" != "" ] ; do
++ case $1 in
++ --noshared)
++ NOSHARED=1
++ ;;
++ --mmal)
++ MMAL=1
++ ;;
++ --usr)
++ INSTALL_SUDO=1
++ USR_PREFIX=/usr
++ ;;
++ --make)
++ DO_MAKE=1
++ ;;
++ --install)
++ DO_MAKE=1
++ DO_INSTALL=1
++ ;;
++ *)
++            echo "Usage $0: [--noshared] [--mmal] [--usr] [--make] [--install]"
++ echo " noshared Build static libs and executable - good for testing"
++ echo " mmal Build mmal decoders"
++ echo " usr Set install prefix to /usr [default=<build-dir>/install]"
++ echo " make Make after configure"
++ echo " install Make & install after configure - does sudo on install if --usr"
++ exit 1
++ ;;
++ esac
++ shift
++done
++
++
++MCOPTS=
++RPI_INCLUDES=
++RPI_LIBDIRS=
++RPI_DEFINES=
++RPI_EXTRALIBS=
++
++# uname -m gives kernel type which may not have the same
++# 32/64bitness as userspace :-( getconf should provide the answer
++# but use uname to check we are on the right processor
++MC=`uname -m`
++LB=`getconf LONG_BIT`
++if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then
++ if [ "$LB" == "32" ]; then
++ echo "M/C armv7"
++ A=arm-linux-gnueabihf
++ B=armv7
++ MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
++ RPI_DEFINES=-mfpu=neon-vfpv4
++ elif [ "$LB" == "64" ]; then
++ echo "M/C aarch64"
++ A=aarch64-linux-gnu
++ B=arm64
++ else
++ echo "Unknown LONG_BIT name: $LB"
++ exit 1
++ fi
++else
++ echo "Unknown machine name: $MC"
++ exit 1
++fi
++
++if [ $MMAL ]; then
++ RPI_OPT_VC=/opt/vc
++ RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
++ RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
++ RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
++ RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
++ RPIOPTS="--enable-mmal"
++else
++ RPIOPTS="--disable-mmal"
++fi
++
++C=`lsb_release -sc`
++V=`cat RELEASE`
++
++SHARED_LIBS="--enable-shared"
++if [ $NOSHARED ]; then
++ SHARED_LIBS="--disable-shared"
++ OUT=$BUILDBASE/$B-$C-$V-static-rel
++ echo Static libs
++else
++ echo Shared libs
++ OUT=$BUILDBASE/$B-$C-$V-shared-rel
++fi
++
++if [ ! $USR_PREFIX ]; then
++ USR_PREFIX=$OUT/install
++fi
++LIB_PREFIX=$USR_PREFIX/lib/$A
++INC_PREFIX=$USR_PREFIX/include/$A
++
++echo Destination directory: $OUT
++mkdir -p $OUT
++# Nothing under here need worry git - including this .gitignore!
++echo "**" > $BUILDBASE/.gitignore
++cd $OUT
++
++$FFSRC/configure \
++ --prefix=$USR_PREFIX\
++ --libdir=$LIB_PREFIX\
++ --incdir=$INC_PREFIX\
++ $MCOPTS\
++ --disable-stripping\
++ --disable-thumb\
++ --enable-sand\
++ --enable-v4l2-request\
++ --enable-libdrm\
++ --enable-vout-egl\
++ --enable-vout-drm\
++ --enable-gpl\
++ $SHARED_LIBS\
++ $RPIOPTS\
++ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
++ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
++ --extra-ldflags="$RPI_LIBDIRS"\
++ --extra-libs="$RPI_EXTRALIBS"\
++ --extra-version="rpi"
++
++echo "Configured into $OUT"
++
++if [ $DO_MAKE ]; then
++ echo "Making..."
++ make -j8
++ echo "Made"
++fi
++if [ $DO_INSTALL ]; then
++ echo "Installing..."
++ if [ $INSTALL_SUDO ]; then
++ sudo make -j8 install
++ else
++ make -j8 install
++ fi
++ echo "Installed"
++fi
++
++
++# gcc option for getting asm listing
++# -Wa,-ahls
+diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
+new file mode 100755
+index 0000000000..657568014e
+--- /dev/null
++++ b/pi-util/ffconf.py
+@@ -0,0 +1,215 @@
++#!/usr/bin/env python3
++
++import string
++import os
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++CODEC_HEVC_RPI = 1
++HWACCEL_RPI = 2
++HWACCEL_DRM = 3
++HWACCEL_VAAPI = 4
++
++def testone(fileroot, srcname, es_file, md5_file, pix, dectype, vcodec, ffmpeg_exec):
++ hwaccel = ""
++ if dectype == HWACCEL_RPI:
++ hwaccel = "rpi"
++ elif dectype == HWACCEL_DRM:
++ hwaccel = "drm"
++ elif dectype == HWACCEL_VAAPI:
++ hwaccel = "vaapi"
++
++ pix_fmt = []
++ if pix == "8":
++ pix_fmt = ["-pix_fmt", "yuv420p"]
++ elif pix == "10":
++ pix_fmt = ["-pix_fmt", "yuv420p10le"]
++ elif pix == "12":
++ pix_fmt = ["-pix_fmt", "yuv420p12le"]
++
++ tmp_root = "/tmp"
++
++ names = srcname.split('/')
++ while len(names) > 1:
++ tmp_root = os.path.join(tmp_root, names[0])
++ del names[0]
++ name = names[0]
++
++ if not os.path.exists(tmp_root):
++ os.makedirs(tmp_root)
++
++ dec_file = os.path.join(tmp_root, name + ".dec.md5")
++ try:
++ os.remove(dec_file)
++ except:
++ pass
++
++ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
++
++ ffargs = [ffmpeg_exec, "-flags", "unaligned", "-hwaccel", hwaccel, "-vcodec", "hevc", "-i", os.path.join(fileroot, es_file)] + pix_fmt + ["-f", "md5", dec_file]
++
++ # Unaligned needed for cropping conformance
++ if hwaccel:
++ rstr = subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT)
++ else:
++ rstr = subprocess.call(
++ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
++ stdout=flog, stderr=subprocess.STDOUT)
++
++ try:
++ m1 = None
++ m2 = None
++ with open(os.path.join(fileroot, md5_file)) as f:
++ for line in f:
++ m1 = re.search("[0-9a-f]{32}", line.lower())
++ if m1:
++ break
++
++ with open(dec_file) as f:
++ m2 = re.search("[0-9a-f]{32}", f.readline())
++ except:
++ pass
++
++ if m1 and m2 and m1.group() == m2.group():
++ print("Match: " + m1.group(), file=flog)
++ rv = 0
++ elif not m1:
++ print("****** Cannot find m1", file=flog)
++ rv = 3
++ elif not m2:
++ print("****** Cannot find m2", file=flog)
++ rv = 2
++ else:
++ print("****** Mismatch: " + m1.group() + " != " + m2.group(), file=flog)
++ rv = 1
++ flog.close()
++ return rv
++
++def scandir(root):
++ aconf = []
++ ents = os.listdir(root)
++ ents.sort(key=str.lower)
++ for name in ents:
++ test_path = os.path.join(root, name)
++ if S_ISDIR(os.stat(test_path).st_mode):
++ files = os.listdir(test_path)
++ es_file = "?"
++ md5_file = "?"
++ for f in files:
++ (base, ext) = os.path.splitext(f)
++ if base[0] == '.':
++ pass
++ elif ext == ".bit" or ext == ".bin":
++ es_file = f
++ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
++ if md5_file == "?":
++ md5_file = f
++ elif base[-3:] == "yuv":
++ md5_file = f
++ aconf.append((1, name, es_file, md5_file))
++ return aconf
++
++def runtest(name, tests):
++ if not tests:
++ return True
++ for t in tests:
++ if name[0:len(t)] == t or name.find("/" + t) != -1:
++ return True
++ return False
++
++def doconf(csva, tests, test_root, vcodec, dectype, ffmpeg_exec):
++ unx_failures = []
++ unx_success = []
++ failures = 0
++ successes = 0
++ for a in csva:
++ exp_test = int(a[0])
++ if (exp_test and runtest(a[1], tests)):
++ name = a[1]
++ print ("==== ", name, end="")
++ sys.stdout.flush()
++
++ rv = testone(os.path.join(test_root, name), name, a[2], a[3], a[4], dectype=dectype, vcodec=vcodec, ffmpeg_exec=ffmpeg_exec)
++ if (rv == 0):
++ successes += 1
++ else:
++ failures += 1
++
++ if (rv == 0):
++ if exp_test == 2:
++ print(": * OK *")
++ unx_success.append(name)
++ else:
++ print(": ok")
++ elif exp_test == 2 and rv == 1:
++ print(": fail")
++ elif exp_test == 3 and rv == 2:
++ # Call an expected "crash" an abort
++ print(": abort")
++ else:
++ unx_failures.append(name)
++ if rv == 1:
++ print(": * FAIL *")
++ elif (rv == 2) :
++ print(": * CRASH *")
++ elif (rv == 3) :
++ print(": * MD5 MISSING *")
++ else :
++ print(": * BANG *")
++
++ if unx_failures or unx_success:
++ print("Unexpected Failures:", unx_failures)
++ print("Unexpected Success: ", unx_success)
++ else:
++ print("All tests normal:", successes, "ok,", failures, "failed")
++
++
++class ConfCSVDialect(csv.Dialect):
++ delimiter = ','
++ doublequote = True
++ lineterminator = '\n'
++ quotechar='"'
++ quoting = csv.QUOTE_MINIMAL
++ skipinitialspace = True
++ strict = True
++
++if __name__ == '__main__':
++
++ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
++ argp.add_argument("tests", nargs='*')
++ argp.add_argument("--pi4", action='store_true', help="Force pi4 cmd line")
++ argp.add_argument("--drm", action='store_true', help="Force v4l2 drm cmd line")
++ argp.add_argument("--vaapi", action='store_true', help="Force vaapi cmd line")
++ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
++ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
++ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
++ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
++ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
++ args = argp.parse_args()
++
++ if args.csvgen:
++ csv.writer(sys.stdout).writerows(scandir(args.test_root))
++ exit(0)
++
++ with open(args.csv, 'rt') as csvfile:
++ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
++
++ dectype = CODEC_HEVC_RPI
++ if os.path.exists("/dev/rpivid-hevcmem"):
++ dectype = HWACCEL_RPI
++ if args.drm or os.path.exists("/sys/module/rpivid_hevc"):
++ dectype = HWACCEL_DRM
++
++ if args.pi4:
++ dectype = HWACCEL_RPI
++ elif args.drm:
++ dectype = HWACCEL_DRM
++ elif args.vaapi:
++ dectype = HWACCEL_VAAPI
++
++ doconf(csva, args.tests, args.test_root, args.vcodec, dectype, args.ffmpeg)
++
+diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
+new file mode 100755
+index 0000000000..65c5224cd8
+--- /dev/null
++++ b/pi-util/ffperf.py
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:
++ close_threshold = 0.01
++
++ def __init__(self, stats_dict=None):
++ if stats_dict != None:
++ self.name = stats_dict["name"]
++ self.elapsed = float(stats_dict["elapsed"])
++ self.user = float(stats_dict["user"])
++ self.sys = float(stats_dict["sys"])
++
++ def times_str(self):
++ ctime = self.sys + self.user
++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
++
++ def dict(self):
++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++ def is_close(self, other):
++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++ def __lt__(self, other):
++ return self.elapsed < other.elapsed
++ def __gt__(self, other):
++ return self.elapsed > other.elapsed
++
++ def time_file(name, prefix, ffmpeg="./ffmpeg"):
++ stats = tstats()
++ stats.name = name
++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++ cproc = subprocess.Popen([ffmpeg, "-no_cvt_hw",
++ "-vcodec", "hevc_rpi",
++ "-t", "30", "-i", prefix + name,
++ "-f", "vout_rpi", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
++ pinfo = os.wait4(cproc.pid, 0)
++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++ stats.elapsed = end_time - start_time
++ stats.user = pinfo[2].ru_utime
++ stats.sys = pinfo[2].ru_stime
++ return stats
++
++
++def common_prefix(s1, s2):
++ for i in range(min(len(s1),len(s2))):
++ if s1[i] != s2[i]:
++ return s1[:i]
++ return s1[:i+1]
++
++def main():
++ global flog
++
++ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
++To blank the screen before starting use "xdg-screensaver activate"
++(For some reason this doesn't seem to work from within python).
++""")
++
++ argp.add_argument("streams", nargs='*')
++ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
++ argp.add_argument("--csv_in", help="CSV input filename")
++ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
++ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
++ argp.add_argument("--ffmpeg", default="./ffmpeg", help="FFmpeg executable")
++
++ args = argp.parse_args()
++
++ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
++ csv_out.writeheader()
++
++ stats_in = {}
++ if args.csv_in != None:
++ with open(args.csv_in, 'r', newline='') as f_in:
++ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++
++ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
++
++ streams = args.streams
++ if not streams:
++ if not stats_in:
++ print ("No source streams specified")
++ return 1
++ prefix = "" if args.prefix == None else args.prefix
++ streams = [k for k in stats_in]
++ elif args.prefix != None:
++ prefix = args.prefix
++ else:
++ prefix = streams[0]
++ for f in streams[1:]:
++ prefix = common_prefix(prefix, f)
++ pp = prefix.rpartition(os.sep)
++ prefix = pp[0] + pp[1]
++ streams = [s[len(prefix):] for s in streams]
++
++ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
++ print ("====", f)
++
++ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
++ for i in range(args.repeat):
++ t = tstats.time_file(f, prefix, args.ffmpeg)
++ print ("...", t.times_str())
++ if t0 > t:
++ t0 = t
++
++ if t0.name in stats_in:
++ pstat = stats_in[t0.name]
++ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
++
++ csv_out.writerow(t0.dict())
++
++ print ()
++
++ return 0
++
++
++if __name__ == '__main__':
++ exit(main())
++
+diff --git a/pi-util/genpatch.sh b/pi-util/genpatch.sh
+new file mode 100755
+index 0000000000..0948a68a7a
+--- /dev/null
++++ b/pi-util/genpatch.sh
+@@ -0,0 +1,35 @@
++set -e
++
++NOPATCH=
++if [ "$1" == "--notag" ]; then
++ shift
++ NOPATCH=1
++fi
++
++if [ "$1" == "" ]; then
++ echo Usage: $0 [--notag] \<patch_tag\>
++ echo e.g.: $0 mmal_4
++ exit 1
++fi
++
++VERSION=`cat RELEASE`
++if [ "$VERSION" == "" ]; then
++ echo Can\'t find version RELEASE
++ exit 1
++fi
++
++PATCHFILE=../ffmpeg-$VERSION-$1.patch
++
++if [ $NOPATCH ]; then
++ echo Not tagged
++else
++  # Only continue if we are all committed
++ git diff --name-status --exit-code
++
++ PATCHTAG=pi/$VERSION/$1
++ echo Tagging: $PATCHTAG
++
++ git tag $PATCHTAG
++fi
++echo Generating patch: $PATCHFILE
++git diff n$VERSION -- > $PATCHFILE
+diff --git a/pi-util/make_array.py b/pi-util/make_array.py
+new file mode 100755
+index 0000000000..67b22d2d51
+--- /dev/null
++++ b/pi-util/make_array.py
+@@ -0,0 +1,23 @@
++#!/usr/bin/env python
++
++# Usage
++# make_array file.bin
++# Produces file.h with array of bytes.
++#
++import sys
++for file in sys.argv[1:]:
++ prefix,suffix = file.split('.')
++ assert suffix=='bin'
++ name=prefix.split('/')[-1]
++ print 'Converting',file
++ with open(prefix+'.h','wb') as out:
++ print >>out, 'static const unsigned char',name,'[] = {'
++ with open(file,'rb') as fd:
++ i = 0
++ for byte in fd.read():
++ print >>out, '0x%02x, ' % ord(byte),
++ i = i + 1
++ if i % 8 == 0:
++ print >>out, ' // %04x' % (i - 8)
++ print >>out,'};'
++
+diff --git a/pi-util/mkinst.sh b/pi-util/mkinst.sh
+new file mode 100755
+index 0000000000..271a39e846
+--- /dev/null
++++ b/pi-util/mkinst.sh
+@@ -0,0 +1,5 @@
++set -e
++
++make install
++
++cp -r install/* ../vlc/sysroot/raspian_stretch_pi1-sysroot/usr
+diff --git a/pi-util/patkodi.sh b/pi-util/patkodi.sh
+new file mode 100644
+index 0000000000..dcd05a606e
+--- /dev/null
++++ b/pi-util/patkodi.sh
+@@ -0,0 +1,9 @@
++set -e
++KODIBASE=/home/jc/rpi/kodi/xbmc
++JOBS=-j20
++make $JOBS
++git diff xbmc/release/4.3-kodi > $KODIBASE/tools/depends/target/ffmpeg/pfcd_hevc_optimisations.patch
++make -C $KODIBASE/tools/depends/target/ffmpeg $JOBS
++make -C $KODIBASE/build install
++
++
+diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py
+new file mode 100755
+index 0000000000..e44cfa0c3c
+--- /dev/null
++++ b/pi-util/perfcmp.py
+@@ -0,0 +1,101 @@
++#!/usr/bin/env python3
++
++import time
++import string
++import os
++import tempfile
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class tstats:
++ close_threshold = 0.01
++
++ def __init__(self, stats_dict=None):
++ if stats_dict != None:
++ self.name = stats_dict["name"]
++ self.elapsed = float(stats_dict["elapsed"])
++ self.user = float(stats_dict["user"])
++ self.sys = float(stats_dict["sys"])
++
++ def times_str(self):
++ ctime = self.sys + self.user
++ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
++
++ def dict(self):
++ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
++
++ def is_close(self, other):
++ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
++
++ def __lt__(self, other):
++ return self.elapsed < other.elapsed
++ def __gt__(self, other):
++ return self.elapsed > other.elapsed
++
++ def time_file(name, prefix):
++ stats = tstats()
++ stats.name = name
++ start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
++ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
++ pinfo = os.wait4(cproc.pid, 0)
++ end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
++ stats.elapsed = end_time - start_time
++ stats.user = pinfo[2].ru_utime
++ stats.sys = pinfo[2].ru_stime
++ return stats
++
++
++def common_prefix(s1, s2):
++ for i in range(min(len(s1),len(s2))):
++ if s1[i] != s2[i]:
++ return s1[:i]
++ return s1[:i+1]
++
++def main():
++ argp = argparse.ArgumentParser(description="FFmpeg performance compare")
++
++ argp.add_argument("stream0", help="CSV to compare")
++ argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
++
++ args = argp.parse_args()
++
++ with open(args.stream0, 'r', newline='') as f_in:
++ stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++ with open(args.stream1, 'r', newline='') as f_in:
++ stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
++
++ print (args.stream0, "<<-->>", args.stream1)
++ print ()
++
++ for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
++ if not (f in stats0) :
++ print (" XX :", f)
++ continue
++ if not (f in stats1) :
++ print (" XX :", f)
++ continue
++
++ s0 = stats0[f]
++ s1 = stats1[f]
++
++ pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
++ thresh = 0.3
++ tc = 6
++
++ nchar = min(tc - 1, int(abs(pcent) / thresh))
++ cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
++
++ print ("%6.2f %s%6.2f (%+5.2f) : %s" %
++ (s0.elapsed, cc, s1.elapsed, pcent, f))
++
++ return 0
++
++
++if __name__ == '__main__':
++ exit(main())
++
+diff --git a/pi-util/qem.sh b/pi-util/qem.sh
+new file mode 100755
+index 0000000000..a4dbb6eacd
+--- /dev/null
++++ b/pi-util/qem.sh
+@@ -0,0 +1,9 @@
++TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
++QASM=python\ ../local/bin/qasm.py
++SRC_FILE=libavcodec/rpi_hevc_shader.qasm
++DST_BASE=shader
++
++cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
++$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
++$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
++
+diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py
+new file mode 100755
+index 0000000000..b322dac0c2
+--- /dev/null
++++ b/pi-util/testfilt.py
+@@ -0,0 +1,83 @@
++#!/usr/bin/env python3
++
++import string
++import os
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class validator:
++ def __init__(self):
++ self.ok = False
++
++ def isok(self):
++ return self.ok
++
++ def setok(self):
++ self.ok = True
++
++class valid_regex(validator):
++ def __init__(self, regex):
++ super().__init__()
++ self.regex = re.compile(regex)
++
++ def scanline(self, line):
++ if self.isok() or self.regex.search(line):
++ self.setok()
++
++
++def validate(validators, flog):
++ for line in flog:
++ for v in validators:
++ v.scanline(line)
++
++ ok = True
++ for v in validators:
++ if not v.isok():
++ ok = False
++ # complain
++ print("Test failed")
++
++ if ok:
++ print("OK")
++ return ok
++
++def runtest(name, ffmpeg, args, suffix, validators):
++ log_root = os.path.join("/tmp", "testfilt", name)
++ ofilename = os.path.join(log_root, name + suffix)
++
++ if not os.path.exists(log_root):
++ os.makedirs(log_root)
++
++ try:
++ os.remove(ofilename)
++ except:
++ pass
++
++ flog = open(os.path.join(log_root, name + ".log"), "wb")
++ ffargs = [ffmpeg] + args + [ofilename]
++
++ subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False)
++ flog.close
++
++ flog = open(os.path.join(log_root, name + ".log"), "rt")
++ return validate(validators, flog)
++
++def sayok(log_root, flog):
++ print("Woohoo")
++ return True
++
++if __name__ == '__main__':
++
++ argp = argparse.ArgumentParser(description="FFmpeg filter tester")
++ argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
++ args = argp.parse_args()
++
++ runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i",
++ "/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv",
++# "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv",
++ "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv",
++ [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')])
+diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
+new file mode 100755
+index 0000000000..5935a11ca5
+--- /dev/null
++++ b/pi-util/v3dusage.py
+@@ -0,0 +1,128 @@
++#!/usr/bin/env python
++
++import sys
++import argparse
++import re
++
++def do_logparse(logname):
++
++ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
++ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
++ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
++ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
++
++ ttotal = {'idle':0.0}
++ tstart = {}
++ qctotal = {}
++ qtstotal = {}
++ l2hits = {}
++ l2total = {}
++ time0 = None
++ idle_start = None
++ qpu_op_no = 0
++ op_count = 0
++
++ with open(logname, "rt") as infile:
++ for line in infile:
++ match = rmatch.match(line)
++ if match:
++# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
++ time = float(match.group(1))
++ unit = match.group(3)
++ opstart = not match.group(2)
++ optype = match.group(7)
++ hascb = match.group(8) != "0"
++
++ if unit == 'qpu1':
++ unit = unit + "." + str(qpu_op_no)
++ if not opstart:
++ if hascb or optype == 'EXECUTE_SYNC':
++ qpu_op_no = 0
++ else:
++ qpu_op_no += 1
++
++ # Ignore sync type
++ if optype == 'EXECUTE_SYNC':
++ continue
++
++ if not time0:
++ time0 = time
++
++ if opstart:
++ tstart[unit] = time;
++ elif unit in tstart:
++ op_count += 1
++ if not unit in ttotal:
++ ttotal[unit] = 0.0
++ ttotal[unit] += time - tstart[unit]
++ del tstart[unit]
++
++ if not idle_start and not tstart:
++ idle_start = time
++ elif idle_start and tstart:
++ ttotal['idle'] += time - idle_start
++ idle_start = None
++
++ match = rqcycle.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in qctotal:
++ qctotal[unit] = 0
++ qctotal[unit] += int(match.group(2))
++
++ match = rqtscycle.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in qtstotal:
++ qtstotal[unit] = 0
++ qtstotal[unit] += int(match.group(2))
++
++ match = rl2hits.match(line)
++ if match:
++ unit = "qpu1." + str(qpu_op_no)
++ if not unit in l2total:
++ l2total[unit] = 0
++ l2hits[unit] = 0
++ l2total[unit] += int(match.group(3))
++ if match.group(2) == "hits":
++ l2hits[unit] += int(match.group(3))
++
++
++ if not time0:
++ print "No v3d profile records found"
++ else:
++ tlogged = time - time0
++
++ print "Logged time:", tlogged, " Op count:", op_count
++ for unit in sorted(ttotal):
++ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
++ print
++ for unit in sorted(qctotal):
++ if not unit in qtstotal:
++ qtstotal[unit] = 0;
++ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
++ if unit in l2total:
++ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
++
++
++
++if __name__ == '__main__':
++ argp = argparse.ArgumentParser(
++ formatter_class=argparse.RawDescriptionHelpFormatter,
++ description="QPU/VPU perf summary from VC logging",
++ epilog = """
++Will also summarise TMU stalls if logging requests set in qpu noflush param
++in the profiled code.
++
++Example use:
++ vcgencmd set_logging level=0xc0
++ <command to profile>
++ sudo vcdbg log msg >& t.log
++ v3dusage.py t.log
++""")
++
++ argp.add_argument("logfile")
++ args = argp.parse_args()
++
++ do_logparse(args.logfile)
++
diff --git a/tools/depends/target/ffmpeg/CMakeLists.txt b/tools/depends/target/ffmpeg/CMakeLists.txt
index 0bbc23540a..1bf9f53499 100644
--- a/tools/depends/target/ffmpeg/CMakeLists.txt
+++ b/tools/depends/target/ffmpeg/CMakeLists.txt
@@ -40,6 +40,11 @@ list(APPEND ffmpeg_conf --disable-doc
--extra-version="Kodi"
)
+ string(CONCAT CMAKE_C_FLAGS ${CMAKE_C_FLAGS} " -I/opt/vc/include -I/opt/vc/include/interface/vcos/pthreads -I/opt/vc/include/interface/vmcs_host/linux")
+ string(CONCAT CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS} " -L/opt/vc/lib")
+ string(CONCAT CMAKE_MODULE_LINKER_FLAGS ${CMAKE_MODULE_LINKER_FLAGS} " -L/opt/vc/lib")
+ list(APPEND ffmpeg_conf --enable-sand --enable-v4l2-request --enable-libdrm --enable-libudev --disable-hwaccel=h264_v4l2request --disable-hwaccel=mpeg2_v4l2request --disable-hwaccel=vp8_v4l2request)
+
if(CMAKE_C_FLAGS)
list(APPEND ffmpeg_conf --extra-cflags=${CMAKE_C_FLAGS})
endif()
--
2.34.1
From 13b7bc03c1be11acdc93a68aaabecb8b0483bb3c Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Thu, 14 Jan 2021 18:36:57 +0000
Subject: [PATCH 05/24] DVDVideoCodecDRMPRIME: Discard corrupt frames
ffmpeg/V4L2 decoder can set AV_FRAME_FLAG_CORRUPT if the frame failed to decode.
Pass that onto VideoPlayer so the frame can skip renderer
---
xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index eb2943bb8c..d8827e8296 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -582,6 +582,7 @@ void CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
pVideoPicture->iRepeatPicture = 0;
pVideoPicture->iFlags = 0;
+ pVideoPicture->iFlags |= !(m_pFrame->flags & AV_FRAME_FLAG_CORRUPT) ? 0 : DVP_FLAG_DROPPED;
pVideoPicture->iFlags |= m_pFrame->interlaced_frame ? DVP_FLAG_INTERLACED : 0;
pVideoPicture->iFlags |= m_pFrame->top_field_first ? DVP_FLAG_TOP_FIELD_FIRST : 0;
pVideoPicture->iFlags |= m_pFrame->data[0] ? 0 : DVP_FLAG_DROPPED;
--
2.34.1
From 49ab54b99684f7a364b3b94b56c71b36625eb2fb Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Fri, 3 Dec 2021 16:00:50 +0000
Subject: [PATCH 06/24] gbm: Set max bpc for high bit depth videos
---
.../HwDecRender/VideoLayerBridgeDRMPRIME.cpp | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/VideoLayerBridgeDRMPRIME.cpp b/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/VideoLayerBridgeDRMPRIME.cpp
index 34d1ab6235..f1e73ee364 100644
--- a/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/VideoLayerBridgeDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/VideoRenderers/HwDecRender/VideoLayerBridgeDRMPRIME.cpp
@@ -34,6 +34,14 @@ void CVideoLayerBridgeDRMPRIME::Disable()
{
// disable video plane
auto plane = m_DRM->GetVideoPlane();
+ auto connector = m_DRM->GetConnector();
+
+ // reset max bpc back to default of 8
+ int bpc = 8;
+ bool result = m_DRM->AddProperty(connector, "max bpc", bpc);
+ CLog::Log(LOGDEBUG, "CVideoLayerBridgeDRMPRIME::{} - setting max bpc to {} ({})",
+ __FUNCTION__, bpc, result);
+
m_DRM->AddProperty(plane, "FB_ID", 0);
m_DRM->AddProperty(plane, "CRTC_ID", 0);
@@ -175,6 +183,13 @@ void CVideoLayerBridgeDRMPRIME::Configure(CVideoBufferDRMPRIME* buffer)
plane->GetPropertyValue("COLOR_RANGE", GetColorRange(picture));
if (colorRange)
m_DRM->AddProperty(plane, "COLOR_RANGE", colorRange.value());
+
+ // set max bpc to allow the drm driver to choose a deep colour mode
+ int bpc = buffer->GetPicture().colorBits > 8 ? 12 : 8;
+ auto connector = m_DRM->GetConnector();
+ bool result = m_DRM->AddProperty(connector, "max bpc", bpc);
+ CLog::Log(LOGDEBUG, "CVideoLayerBridgeDRMPRIME::{} - setting max bpc to {} ({})", __FUNCTION__,
+ bpc, result);
}
void CVideoLayerBridgeDRMPRIME::SetVideoPlane(CVideoBufferDRMPRIME* buffer, const CRect& destRect)
--
2.34.1
From 413cf5370b1f14888c95565b05c54343a698dc0b Mon Sep 17 00:00:00 2001
From: Jonas Karlman <jonas@kwiboo.se>
Date: Sun, 20 Oct 2019 17:10:07 +0000
Subject: [PATCH 07/24] WIP: DVDVideoCodecDRMPRIME: add support for filters
---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 62 +++++++++++++++++--
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h | 10 +++
2 files changed, 66 insertions(+), 6 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index d8827e8296..c8e1d28bc5 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -28,6 +28,8 @@
extern "C"
{
#include <libavcodec/avcodec.h>
+#include <libavfilter/buffersink.h>
+#include <libavfilter/buffersrc.h>
#include <libavutil/error.h>
#include <libavutil/imgutils.h>
#include <libavutil/opt.h>
@@ -599,12 +601,30 @@ void CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
pVideoPicture->dts = DVD_NOPTS_VALUE;
}
-CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideoPicture)
+CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
{
- if (m_codecControlFlags & DVD_CODEC_CTRL_DRAIN)
- Drain();
+ if (!m_pFilterIn)
+ return VC_PICTURE;
+
+ int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame);
+ if (ret < 0)
+ {
+ char err[AV_ERROR_MAX_STRING_SIZE] = {};
+ av_strerror(ret, err, AV_ERROR_MAX_STRING_SIZE);
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - buffersrc add frame failed: {} ({})",
+ __FUNCTION__, err, ret);
+ return VC_ERROR;
+ }
- int ret = avcodec_receive_frame(m_pCodecContext, m_pFrame);
+ return ProcessFilterOut();
+}
+
+CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterOut()
+{
+ if (!m_pFilterOut)
+ return VC_EOF;
+
+ int ret = av_buffersink_get_frame(m_pFilterOut, m_pFrame);
if (ret == AVERROR(EAGAIN))
return VC_BUFFER;
else if (ret == AVERROR_EOF)
@@ -621,11 +641,41 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
{
char err[AV_ERROR_MAX_STRING_SIZE] = {};
av_strerror(ret, err, AV_ERROR_MAX_STRING_SIZE);
- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - receive frame failed: {} ({})", __FUNCTION__,
- err, ret);
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - buffersink get frame failed: {} ({})",
+ __FUNCTION__, err, ret);
return VC_ERROR;
}
+ return VC_PICTURE;
+}
+
+CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideoPicture)
+{
+ if (m_codecControlFlags & DVD_CODEC_CTRL_DRAIN)
+ Drain();
+
+ auto result = ProcessFilterOut();
+ if (result != VC_PICTURE)
+ {
+ int ret = avcodec_receive_frame(m_pCodecContext, m_pFrame);
+ if (ret == AVERROR(EAGAIN))
+ return VC_BUFFER;
+ else if (ret == AVERROR_EOF)
+ return VC_EOF;
+ else if (ret)
+ {
+ char err[AV_ERROR_MAX_STRING_SIZE] = {};
+ av_strerror(ret, err, AV_ERROR_MAX_STRING_SIZE);
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - receive frame failed: {} ({})",
+ __FUNCTION__, err, ret);
+ return VC_ERROR;
+ }
+
+ result = ProcessFilterIn();
+ if (result != VC_PICTURE)
+ return result;
+ }
+
SetPictureParams(pVideoPicture);
if (pVideoPicture->videoBuffer)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
index db49d165e7..b5cacf1a3c 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
@@ -14,6 +14,11 @@
#include <memory>
+extern "C"
+{
+#include <libavfilter/avfilter.h>
+}
+
class CDVDVideoCodecDRMPRIME : public CDVDVideoCodec
{
public:
@@ -35,6 +40,8 @@ protected:
void Drain();
void SetPictureParams(VideoPicture* pVideoPicture);
void UpdateProcessInfo(struct AVCodecContext* avctx, const enum AVPixelFormat fmt);
+ CDVDVideoCodec::VCReturn ProcessFilterIn();
+ CDVDVideoCodec::VCReturn ProcessFilterOut();
static enum AVPixelFormat GetFormat(struct AVCodecContext* avctx, const enum AVPixelFormat* fmt);
static int GetBuffer(struct AVCodecContext* avctx, AVFrame* frame, int flags);
@@ -44,5 +51,8 @@ protected:
double m_DAR = 1.0;
AVCodecContext* m_pCodecContext = nullptr;
AVFrame* m_pFrame = nullptr;
+ AVFilterGraph* m_pFilterGraph = nullptr;
+ AVFilterContext* m_pFilterIn = nullptr;
+ AVFilterContext* m_pFilterOut = nullptr;
std::shared_ptr<IVideoBufferPool> m_videoBufferPool;
};
--
2.34.1
From f84f77a9c1f0b8a3a24a9f3cd9dd9de1c8df8f66 Mon Sep 17 00:00:00 2001
From: Jernej Skrabec <jernej.skrabec@siol.net>
Date: Thu, 26 Dec 2019 11:01:51 +0100
Subject: [PATCH 08/24] WIP: DRMPRIME deinterlace filter
---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 379 +++++++++++++++---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h | 9 +-
2 files changed, 328 insertions(+), 60 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index c8e1d28bc5..8476a3981d 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -20,6 +20,7 @@
#include "utils/CPUInfo.h"
#include "utils/StringUtils.h"
#include "utils/log.h"
+#include "utils/StringUtils.h"
#if defined(HAVE_GBM)
#include "windowing/gbm/WinSystemGbm.h"
@@ -92,12 +93,15 @@ CDVDVideoCodecDRMPRIME::CDVDVideoCodecDRMPRIME(CProcessInfo& processInfo)
: CDVDVideoCodec(processInfo)
{
m_pFrame = av_frame_alloc();
+ m_pFilterFrame = av_frame_alloc();
m_videoBufferPool = std::make_shared<CVideoBufferPoolDRMPRIMEFFmpeg>();
}
CDVDVideoCodecDRMPRIME::~CDVDVideoCodecDRMPRIME()
{
av_frame_free(&m_pFrame);
+ av_frame_free(&m_pFilterFrame);
+ FilterClose();
avcodec_free_context(&m_pCodecContext);
}
@@ -379,8 +383,19 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
}
UpdateProcessInfo(m_pCodecContext, m_pCodecContext->pix_fmt);
- m_processInfo.SetVideoDeintMethod("none");
+ m_processInfo.SetVideoInterlaced(false);
m_processInfo.SetVideoDAR(hints.aspect);
+ m_processInfo.SetVideoDeintMethod("none");
+
+ FilterTest();
+
+ if (!m_deintFilterName.empty())
+ {
+ std::list<EINTERLACEMETHOD> methods;
+ methods.push_back(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
+ m_processInfo.UpdateDeinterlacingMethods(methods);
+ m_processInfo.SetDeinterlacingMethodDefault(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
+ }
return true;
}
@@ -456,6 +471,8 @@ void CDVDVideoCodecDRMPRIME::Reset()
return;
Drain();
+ m_filters.clear();
+ FilterClose();
do
{
@@ -503,7 +520,7 @@ void CDVDVideoCodecDRMPRIME::Drain()
av_packet_free(&avpkt);
}
-void CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
+bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
{
pVideoPicture->iWidth = m_pFrame->width;
pVideoPicture->iHeight = m_pFrame->height;
@@ -599,13 +616,238 @@ void CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
? DVD_NOPTS_VALUE
: static_cast<double>(pts) * DVD_TIME_BASE / AV_TIME_BASE;
pVideoPicture->dts = DVD_NOPTS_VALUE;
+
+ if (pVideoPicture->videoBuffer)
+ {
+ pVideoPicture->videoBuffer->Release();
+ pVideoPicture->videoBuffer = nullptr;
+ }
+
+ if (IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format)))
+ {
+ CVideoBufferDRMPRIMEFFmpeg* buffer =
+ dynamic_cast<CVideoBufferDRMPRIMEFFmpeg*>(m_videoBufferPool->Get());
+ buffer->SetPictureParams(*pVideoPicture);
+ buffer->SetRef(m_pFrame);
+ pVideoPicture->videoBuffer = buffer;
+ }
+ else if (m_pFrame->opaque)
+ {
+ CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(m_pFrame->opaque);
+ buffer->SetPictureParams(*pVideoPicture);
+ buffer->Acquire();
+ buffer->SyncEnd();
+ buffer->SetDimensions(m_pFrame->width, m_pFrame->height);
+
+ pVideoPicture->videoBuffer = buffer;
+ av_frame_unref(m_pFrame);
+ }
+
+ if (!pVideoPicture->videoBuffer)
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - videoBuffer:nullptr format:{}", __FUNCTION__,
+ av_get_pix_fmt_name(static_cast<AVPixelFormat>(m_pFrame->format)));
+ av_frame_unref(m_pFrame);
+ return false;
+ }
+
+ return true;
}
-CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
+void CDVDVideoCodecDRMPRIME::FilterTest()
{
- if (!m_pFilterIn)
- return VC_PICTURE;
+ const AVFilter* filter;
+ void* opaque{};
+
+ m_deintFilterName.clear();
+
+ while ((filter = av_filter_iterate(&opaque)) != nullptr)
+ {
+ std::string name(filter->name);
+
+ if (name.find("deinterlace") != std::string::npos)
+ {
+ if (FilterOpen(name, true))
+ {
+ m_deintFilterName = name;
+ CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - found deinterlacing filter {}",
+ __FUNCTION__, name);
+
+ return;
+ }
+ }
+ }
+
+ CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - no deinterlacing filter found",
+ __FUNCTION__);
+}
+
+bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+{
+ int result;
+
+ if (m_pFilterGraph)
+ FilterClose();
+
+ if (filters.empty())
+ return true;
+
+ if (!(m_pFilterGraph = avfilter_graph_alloc()))
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - unable to alloc filter graph");
+ return false;
+ }
+
+ const AVFilter* srcFilter = avfilter_get_by_name("buffer");
+ const AVFilter* outFilter = avfilter_get_by_name("buffersink");
+ enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE };
+
+ std::string args = StringUtils::Format("video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:"
+ "pixel_aspect=%d/%d:sws_param=flags=2",
+ m_pCodecContext->width,
+ m_pCodecContext->height,
+ m_pCodecContext->pix_fmt,
+ m_pCodecContext->time_base.num ?
+ m_pCodecContext->time_base.num : 1,
+ m_pCodecContext->time_base.num ?
+ m_pCodecContext->time_base.den : 1,
+ m_pCodecContext->sample_aspect_ratio.num != 0 ?
+ m_pCodecContext->sample_aspect_ratio.num : 1,
+ m_pCodecContext->sample_aspect_ratio.num != 0 ?
+ m_pCodecContext->sample_aspect_ratio.den : 1);
+
+ result = avfilter_graph_create_filter(&m_pFilterIn, srcFilter, "src",
+ args.c_str(), NULL, m_pFilterGraph);
+ if (result < 0)
+ {
+ char err[AV_ERROR_MAX_STRING_SIZE] = {};
+ av_strerror(result, err, AV_ERROR_MAX_STRING_SIZE);
+ CLog::Log(LOGERROR,
+ "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: src: {} ({})",
+ err, result);
+ return false;
+ }
+
+ AVBufferSrcParameters *par = av_buffersrc_parameters_alloc();
+ if (!par)
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - unable to alloc buffersrc");
+ return false;
+ }
+
+ memset(par, 0, sizeof(*par));
+ par->format = AV_PIX_FMT_NONE;
+ par->hw_frames_ctx = m_pCodecContext->hw_device_ctx;
+
+ result = av_buffersrc_parameters_set(m_pFilterIn, par);
+ if (result < 0)
+ {
+ char err[AV_ERROR_MAX_STRING_SIZE] = {};
+ av_strerror(result, err, AV_ERROR_MAX_STRING_SIZE);
+ CLog::Log(LOGERROR,
+ "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersrc_parameters_set: {} ({})",
+ err, result);
+ return false;
+ }
+ av_freep(&par);
+
+ result = avfilter_graph_create_filter(&m_pFilterOut, outFilter, "out",
+ NULL, NULL, m_pFilterGraph);
+ if (result < 0)
+ {
+ char err[AV_ERROR_MAX_STRING_SIZE] = {};
+ av_strerror(result, err, AV_ERROR_MAX_STRING_SIZE);
+ CLog::Log(LOGERROR,
+ "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: out: {} ({})",
+ err, result);
+ return false;
+ }
+
+ result = av_opt_set_int_list(m_pFilterOut, "pix_fmts", &pix_fmts[0],
+ AV_PIX_FMT_NONE, AV_OPT_SEARCH_CHILDREN);
+ if (result < 0)
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - failed settings pix formats");
+ return false;
+ }
+
+ AVFilterInOut* outputs = avfilter_inout_alloc();
+ AVFilterInOut* inputs = avfilter_inout_alloc();
+
+ outputs->name = av_strdup("in");
+ outputs->filter_ctx = m_pFilterIn;
+ outputs->pad_idx = 0;
+ outputs->next = nullptr;
+
+ inputs->name = av_strdup("out");
+ inputs->filter_ctx = m_pFilterOut;
+ inputs->pad_idx = 0;
+ inputs->next = nullptr;
+
+ result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL);
+ avfilter_inout_free(&outputs);
+ avfilter_inout_free(&inputs);
+
+ if (result < 0)
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse");
+ return false;
+ }
+
+ if ((result = avfilter_graph_config(m_pFilterGraph, nullptr)) < 0)
+ {
+ char err[AV_ERROR_MAX_STRING_SIZE] = {};
+ av_strerror(result, err, AV_ERROR_MAX_STRING_SIZE);
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_config: {} ({})",
+ err, result);
+ return false;
+ }
+
+ if (test)
+ {
+ FilterClose();
+ return true;
+ }
+
+ if (filters.find("deinterlace") != std::string::npos)
+ {
+ m_processInfo.SetVideoDeintMethod(filters);
+ }
+ else
+ {
+ m_processInfo.SetVideoDeintMethod("none");
+ }
+
+ if (CServiceBroker::GetLogging().CanLogComponent(LOGVIDEO))
+ {
+ char* graphDump = avfilter_graph_dump(m_pFilterGraph, nullptr);
+ if (graphDump)
+ {
+ CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::FilterOpen - Final filter graph:\n%s",
+ graphDump);
+ av_freep(&graphDump);
+ }
+ }
+
+ return true;
+}
+
+void CDVDVideoCodecDRMPRIME::FilterClose()
+{
+ if (m_pFilterGraph)
+ {
+ CLog::Log(LOGDEBUG, LOGVIDEO, "CDVDVideoCodecDRMPRIME::FilterClose - Freeing filter graph");
+ avfilter_graph_free(&m_pFilterGraph);
+
+ // Disposed by above code
+ m_pFilterIn = nullptr;
+ m_pFilterOut = nullptr;
+ }
+}
+
+CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
+{
int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame);
if (ret < 0)
{
@@ -621,21 +863,14 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterOut()
{
- if (!m_pFilterOut)
- return VC_EOF;
-
- int ret = av_buffersink_get_frame(m_pFilterOut, m_pFrame);
+ int ret = av_buffersink_get_frame(m_pFilterOut, m_pFilterFrame);
if (ret == AVERROR(EAGAIN))
return VC_BUFFER;
else if (ret == AVERROR_EOF)
{
- if (m_codecControlFlags & DVD_CODEC_CTRL_DRAIN)
- {
- CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - flush buffers", __FUNCTION__);
- avcodec_flush_buffers(m_pCodecContext);
- SetCodecControl(m_codecControlFlags & ~DVD_CODEC_CTRL_DRAIN);
- }
- return VC_EOF;
+ ret = av_buffersink_get_frame(m_pFilterOut, m_pFilterFrame);
+ if (ret < 0)
+ return VC_BUFFER;
}
else if (ret)
{
@@ -646,71 +881,97 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterOut()
return VC_ERROR;
}
+ av_frame_unref(m_pFrame);
+ av_frame_move_ref(m_pFrame, m_pFilterFrame);
+
return VC_PICTURE;
}
+std::string CDVDVideoCodecDRMPRIME::GetFilterChain(bool interlaced)
+{
+ // ask codec to do deinterlacing if possible
+ EINTERLACEMETHOD mInt = m_processInfo.GetVideoSettings().m_InterlaceMethod;
+ std::string filterChain;
+
+ if (!m_processInfo.Supports(mInt))
+ mInt = m_processInfo.GetFallbackDeintMethod();
+
+ if (mInt != VS_INTERLACEMETHOD_NONE && interlaced && !m_deintFilterName.empty())
+ filterChain += m_deintFilterName;
+
+ return filterChain;
+}
+
CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideoPicture)
{
if (m_codecControlFlags & DVD_CODEC_CTRL_DRAIN)
Drain();
- auto result = ProcessFilterOut();
- if (result != VC_PICTURE)
+ if (m_pFilterGraph)
{
- int ret = avcodec_receive_frame(m_pCodecContext, m_pFrame);
- if (ret == AVERROR(EAGAIN))
- return VC_BUFFER;
- else if (ret == AVERROR_EOF)
- return VC_EOF;
- else if (ret)
+ auto ret = ProcessFilterOut();
+ if (ret == VC_PICTURE)
{
- char err[AV_ERROR_MAX_STRING_SIZE] = {};
- av_strerror(ret, err, AV_ERROR_MAX_STRING_SIZE);
- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - receive frame failed: {} ({})",
- __FUNCTION__, err, ret);
- return VC_ERROR;
+ if (!SetPictureParams(pVideoPicture))
+ return VC_ERROR;
+ return VC_PICTURE;
+ }
+ else if (ret != VC_BUFFER)
+ {
+ return ret;
}
-
- result = ProcessFilterIn();
- if (result != VC_PICTURE)
- return result;
}
- SetPictureParams(pVideoPicture);
-
- if (pVideoPicture->videoBuffer)
+ int ret = avcodec_receive_frame(m_pCodecContext, m_pFrame);
+ if (ret == AVERROR(EAGAIN))
+ return VC_BUFFER;
+ else if (ret == AVERROR_EOF)
+ return VC_EOF;
+ else if (ret)
{
- pVideoPicture->videoBuffer->Release();
- pVideoPicture->videoBuffer = nullptr;
+ char err[AV_ERROR_MAX_STRING_SIZE] = {};
+ av_strerror(ret, err, AV_ERROR_MAX_STRING_SIZE);
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - receive frame failed: {} ({})",
+ __FUNCTION__, err, ret);
+ return VC_ERROR;
}
- if (IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format)))
+ if (!m_processInfo.GetVideoInterlaced() && m_pFrame->interlaced_frame)
+ m_processInfo.SetVideoInterlaced(true);
+
+ std::string filterChain = GetFilterChain(m_pFrame->interlaced_frame);
+ if (!filterChain.empty())
{
- CVideoBufferDRMPRIMEFFmpeg* buffer =
- dynamic_cast<CVideoBufferDRMPRIMEFFmpeg*>(m_videoBufferPool->Get());
- buffer->SetPictureParams(*pVideoPicture);
- buffer->SetRef(m_pFrame);
- pVideoPicture->videoBuffer = buffer;
+ bool reopenFilter = false;
+ if (m_filters != filterChain)
+ reopenFilter = true;
+
+ if (m_pFilterGraph &&
+ (m_pFilterIn->outputs[0]->w != m_pCodecContext->width ||
+ m_pFilterIn->outputs[0]->h != m_pCodecContext->height))
+ reopenFilter = true;
+
+ if (reopenFilter)
+ {
+ m_filters = filterChain;
+ if (!FilterOpen(filterChain, false))
+ FilterClose();
+ }
+
+ if (m_pFilterGraph)
+ {
+ if (ProcessFilterIn() != VC_PICTURE)
+ return VC_NONE;
+ }
}
- else if (m_pFrame->opaque)
+ else
{
- CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(m_pFrame->opaque);
- buffer->SetPictureParams(*pVideoPicture);
- buffer->Acquire();
- buffer->SyncEnd();
- buffer->SetDimensions(m_pFrame->width, m_pFrame->height);
-
- pVideoPicture->videoBuffer = buffer;
- av_frame_unref(m_pFrame);
+ m_filters.clear();
+ FilterClose();
}
- if (!pVideoPicture->videoBuffer)
- {
- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - videoBuffer:nullptr format:{}", __FUNCTION__,
- av_get_pix_fmt_name(static_cast<AVPixelFormat>(m_pFrame->format)));
- av_frame_unref(m_pFrame);
+ if (!SetPictureParams(pVideoPicture))
return VC_ERROR;
- }
return VC_PICTURE;
}
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
index b5cacf1a3c..fab3431d40 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
@@ -38,19 +38,26 @@ public:
protected:
void Drain();
- void SetPictureParams(VideoPicture* pVideoPicture);
+ bool SetPictureParams(VideoPicture* pVideoPicture);
void UpdateProcessInfo(struct AVCodecContext* avctx, const enum AVPixelFormat fmt);
CDVDVideoCodec::VCReturn ProcessFilterIn();
CDVDVideoCodec::VCReturn ProcessFilterOut();
static enum AVPixelFormat GetFormat(struct AVCodecContext* avctx, const enum AVPixelFormat* fmt);
static int GetBuffer(struct AVCodecContext* avctx, AVFrame* frame, int flags);
+ bool FilterOpen(const std::string& filters, bool test);
+ void FilterClose();
+ void FilterTest();
+ std::string GetFilterChain(bool interlaced);
std::string m_name;
+ std::string m_deintFilterName;
+ std::string m_filters;
int m_codecControlFlags = 0;
CDVDStreamInfo m_hints;
double m_DAR = 1.0;
AVCodecContext* m_pCodecContext = nullptr;
AVFrame* m_pFrame = nullptr;
+ AVFrame* m_pFilterFrame = nullptr;
AVFilterGraph* m_pFilterGraph = nullptr;
AVFilterContext* m_pFilterIn = nullptr;
AVFilterContext* m_pFilterOut = nullptr;
--
2.34.1
From 6328258a5eafd638962f1ae7ab69ee99d0a3fbcf Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Wed, 24 Nov 2021 20:22:41 +0000
Subject: [PATCH 09/24] CDVDVideoCodecDRMPRIME: Fix Format calls and some
logging
---
.../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 8476a3981d..ef8819c72b 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -703,8 +703,8 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
const AVFilter* outFilter = avfilter_get_by_name("buffersink");
enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE };
- std::string args = StringUtils::Format("video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:"
- "pixel_aspect=%d/%d:sws_param=flags=2",
+ std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:"
+ "pixel_aspect={}/{}:sws_param=flags=2",
m_pCodecContext->width,
m_pCodecContext->height,
m_pCodecContext->pix_fmt,
@@ -824,7 +824,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
char* graphDump = avfilter_graph_dump(m_pFilterGraph, nullptr);
if (graphDump)
{
- CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::FilterOpen - Final filter graph:\n%s",
+ CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::FilterOpen - Final filter graph:\n{}",
graphDump);
av_freep(&graphDump);
}
--
2.34.1
From 9e3d889343073fda5fc3e02fbfc205eeb5bbc929 Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Fri, 27 Aug 2021 20:29:50 +0100
Subject: [PATCH 10/24] DVDVideoCodecDRMPRIME: Avoid exception with
AV_PIX_FMT_NONE
---
.../cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index ef8819c72b..98b0830488 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -646,7 +646,7 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
if (!pVideoPicture->videoBuffer)
{
CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - videoBuffer:nullptr format:{}", __FUNCTION__,
- av_get_pix_fmt_name(static_cast<AVPixelFormat>(m_pFrame->format)));
+ m_pFrame->format == AV_PIX_FMT_NONE ? "AV_PIX_FMT_NONE" : av_get_pix_fmt_name(static_cast<AVPixelFormat>(m_pFrame->format)));
av_frame_unref(m_pFrame);
return false;
}
--
2.34.1
From b117d37af068b5958dcc5de2e3395c7664fa7077 Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Sat, 11 Sep 2021 14:03:05 +0100
Subject: [PATCH 11/24] CDVDVideoCodecDRMPRIME: Also support YUV420 buffers
CDVDVideoCodecDRMPRIME: Add support for deinterlace of sw decoded buffers
Need to call SetDimensions earlier and store the drm descriptor in expected place
---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 98b0830488..72064b8310 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -623,7 +623,7 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
pVideoPicture->videoBuffer = nullptr;
}
- if (IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format)))
+ if (m_pFrame->format == AV_PIX_FMT_DRM_PRIME)
{
CVideoBufferDRMPRIMEFFmpeg* buffer =
dynamic_cast<CVideoBufferDRMPRIMEFFmpeg*>(m_videoBufferPool->Get());
@@ -701,7 +701,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
const AVFilter* srcFilter = avfilter_get_by_name("buffer");
const AVFilter* outFilter = avfilter_get_by_name("buffersink");
- enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE };
+ enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE };
std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:"
"pixel_aspect={}/{}:sws_param=flags=2",
@@ -848,6 +848,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
{
+ // sw decoded buffers need cache flush and for descripter to be set
+ if (!IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && m_pFrame->opaque != nullptr)
+ {
+ CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(m_pFrame->opaque);
+ buffer->SetDimensions(m_pFrame->width, m_pFrame->height);
+ buffer->SyncEnd();
+ auto descriptor = buffer->GetDescriptor();
+ m_pFrame->data[0] = reinterpret_cast<uint8_t*>(descriptor);
+ }
+
int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame);
if (ret < 0)
{
--
2.34.1
From b40ce61b487f4d2883da72de7cc650f12fcd47e8 Mon Sep 17 00:00:00 2001
From: popcornmix <popcornmix@gmail.com>
Date: Fri, 17 Sep 2021 15:23:16 +0100
Subject: [PATCH 12/24] DVDVideoCodecDRMPRIME: Leave deinterlace filter active
on a progressive frame
Interlaced content often has strange mixtures of interlace and progressive frames (e.g. IIPPPPIIPPPP)
and currently we can be creating and destroying the deinterlace filter graph almost every frame.
If it's been created, then leave it active until end of file. The frames marked as progressive should
be just copied by deinterlace filter
---
.../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 72064b8310..76f9ad49cd 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -906,6 +906,10 @@ std::string CDVDVideoCodecDRMPRIME::GetFilterChain(bool interlaced)
if (!m_processInfo.Supports(mInt))
mInt = m_processInfo.GetFallbackDeintMethod();
+ // avoid disabling deinterlace graph for occasional progressive frames - they will be copied by deinterlace
+ if (!m_filters.empty())
+ interlaced = true;
+
if (mInt != VS_INTERLACEMETHOD_NONE && interlaced && !m_deintFilterName.empty())
filterChain += m_deintFilterName;
--
2.34.1
From 74ad14a451626fec56a25803e8ffa6f20fef4307 Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Tue, 30 Nov 2021 16:05:06 +0000
Subject: [PATCH 13/24] SetVideoInterlaced: Set and unset deinterlace method
name reported
---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 10 ++--------
1 file changed, 2 insertions(+), 8 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 76f9ad49cd..a49418bc0e 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -810,14 +810,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
return true;
}
- if (filters.find("deinterlace") != std::string::npos)
- {
- m_processInfo.SetVideoDeintMethod(filters);
- }
- else
- {
- m_processInfo.SetVideoDeintMethod("none");
- }
+ m_processInfo.SetVideoDeintMethod(filters);
if (CServiceBroker::GetLogging().CanLogComponent(LOGVIDEO))
{
@@ -835,6 +828,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
void CDVDVideoCodecDRMPRIME::FilterClose()
{
+ m_processInfo.SetVideoDeintMethod("none");
if (m_pFilterGraph)
{
CLog::Log(LOGDEBUG, LOGVIDEO, "CDVDVideoCodecDRMPRIME::FilterClose - Freeing filter graph");
--
2.34.1
From 6285a4d1465c6dc193d49a1752b7a9b6e90c3686 Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Wed, 24 Nov 2021 20:21:28 +0000
Subject: [PATCH 14/24] DVDVideoCodecDRMPRIME: Close deinterlace filter on
error
Otherwise we crash later with an invalid m_pFilterGraph pointer
---
.../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index a49418bc0e..fa7c4b28e5 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -726,6 +726,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
CLog::Log(LOGERROR,
"CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: src: {} ({})",
err, result);
+ FilterClose();
return false;
}
@@ -733,6 +734,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
if (!par)
{
CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - unable to alloc buffersrc");
+ FilterClose();
return false;
}
@@ -748,6 +750,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
CLog::Log(LOGERROR,
"CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersrc_parameters_set: {} ({})",
err, result);
+ FilterClose();
return false;
}
av_freep(&par);
@@ -761,6 +764,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
CLog::Log(LOGERROR,
"CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: out: {} ({})",
err, result);
+ FilterClose();
return false;
}
@@ -769,6 +773,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
if (result < 0)
{
CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - failed settings pix formats");
+ FilterClose();
return false;
}
@@ -792,6 +797,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
if (result < 0)
{
CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse");
+ FilterClose();
return false;
}
@@ -801,6 +807,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
av_strerror(result, err, AV_ERROR_MAX_STRING_SIZE);
CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_config: {} ({})",
err, result);
+ FilterClose();
return false;
}
--
2.34.1
From 9b2aa82a0a69fe6c06249a43496256f29c87b19e Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Wed, 18 Jan 2023 16:41:00 +0000
Subject: [PATCH 15/24] CDVDVideoCodecDRMPRIME: Adjust av formats to match
recent ffmpeg changes
---
.../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index fa7c4b28e5..ced0ebac8d 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -355,6 +355,7 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
m_pCodecContext->bits_per_coded_sample = hints.bitsperpixel;
m_pCodecContext->time_base.num = 1;
m_pCodecContext->time_base.den = DVD_TIME_BASE;
+ m_pCodecContext->thread_safe_callbacks = 1;
m_pCodecContext->thread_count = CServiceBroker::GetCPUInfo()->GetCPUCount();
if (hints.extradata)
@@ -701,13 +702,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
const AVFilter* srcFilter = avfilter_get_by_name("buffer");
const AVFilter* outFilter = avfilter_get_by_name("buffersink");
- enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE };
+ enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE };
std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:"
- "pixel_aspect={}/{}:sws_param=flags=2",
+ "pixel_aspect={}/{}",
m_pCodecContext->width,
m_pCodecContext->height,
- m_pCodecContext->pix_fmt,
+ AV_PIX_FMT_DRM_PRIME,
m_pCodecContext->time_base.num ?
m_pCodecContext->time_base.num : 1,
m_pCodecContext->time_base.num ?
@@ -859,6 +860,7 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
m_pFrame->data[0] = reinterpret_cast<uint8_t*>(descriptor);
}
+ m_pFrame->format = AV_PIX_FMT_DRM_PRIME;
int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame);
if (ret < 0)
{
--
2.34.1
From 3524b47a3153011d6c5afddddac3c280b7f37c8a Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Mon, 6 Feb 2023 15:19:51 +0000
Subject: [PATCH 16/24] DVDVideoCodecDRMPRIME: Add support for arbitrary output
pixel formats
This enables any ffmpeg pixel formats to be supported by DRMPRIME decoder
by creating a scale ffmpeg filter to convert it to a supported format.
This allows formats like h264 Hi10P and hevc 12-bit 444 to be software decoded,
converted and displayed through DRM.
This will be a cheaper path than disabling DRMPRIME, which is also
software decode, convert, but then needs convert to texture and display through GL.
And it happens automatically without requiring user video settings
---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 124 +++++++++++-------
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h | 3 +-
2 files changed, 77 insertions(+), 50 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index ced0ebac8d..62fc0cf822 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -219,7 +219,7 @@ enum AVPixelFormat CDVDVideoCodecDRMPRIME::GetFormat(struct AVCodecContext* avct
{
for (int n = 0; fmt[n] != AV_PIX_FMT_NONE; n++)
{
- if (IsSupportedHwFormat(fmt[n]) || IsSupportedSwFormat(fmt[n]))
+ //if (IsSupportedHwFormat(fmt[n]) || IsSupportedSwFormat(fmt[n]))
{
CDVDVideoCodecDRMPRIME* ctx = static_cast<CDVDVideoCodecDRMPRIME*>(avctx->opaque);
ctx->UpdateProcessInfo(avctx, fmt[n]);
@@ -240,7 +240,8 @@ enum AVPixelFormat CDVDVideoCodecDRMPRIME::GetFormat(struct AVCodecContext* avct
int CDVDVideoCodecDRMPRIME::GetBuffer(struct AVCodecContext* avctx, AVFrame* frame, int flags)
{
- if (IsSupportedSwFormat(static_cast<AVPixelFormat>(frame->format)))
+ AVPixelFormat pix_fmt = static_cast<AVPixelFormat>(frame->format);
+ if (IsSupportedSwFormat(pix_fmt))
{
int width = frame->width;
int height = frame->height;
@@ -248,7 +249,7 @@ int CDVDVideoCodecDRMPRIME::GetBuffer(struct AVCodecContext* avctx, AVFrame* fra
AlignedSize(avctx, width, height);
int size;
- switch (avctx->pix_fmt)
+ switch (pix_fmt)
{
case AV_PIX_FMT_YUV420P:
case AV_PIX_FMT_YUVJ420P:
@@ -268,13 +269,12 @@ int CDVDVideoCodecDRMPRIME::GetBuffer(struct AVCodecContext* avctx, AVFrame* fra
CDVDVideoCodecDRMPRIME* ctx = static_cast<CDVDVideoCodecDRMPRIME*>(avctx->opaque);
auto buffer = dynamic_cast<CVideoBufferDMA*>(
- ctx->m_processInfo.GetVideoBufferManager().Get(avctx->pix_fmt, size, nullptr));
+ ctx->m_processInfo.GetVideoBufferManager().Get(pix_fmt, size, nullptr));
if (!buffer)
return -1;
- frame->opaque = static_cast<void*>(buffer);
frame->opaque_ref =
- av_buffer_create(nullptr, 0, ReleaseBuffer, frame->opaque, AV_BUFFER_FLAG_READONLY);
+ av_buffer_create(nullptr, 0, ReleaseBuffer, static_cast<void*>(buffer), AV_BUFFER_FLAG_READONLY);
buffer->Export(frame, width, height);
buffer->SyncStart();
@@ -632,9 +632,9 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
buffer->SetRef(m_pFrame);
pVideoPicture->videoBuffer = buffer;
}
- else if (m_pFrame->opaque)
+ else if (IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)))
{
- CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(m_pFrame->opaque);
+ CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(av_buffer_get_opaque(m_pFrame->buf[0]));
buffer->SetPictureParams(*pVideoPicture);
buffer->Acquire();
buffer->SyncEnd();
@@ -668,13 +668,13 @@ void CDVDVideoCodecDRMPRIME::FilterTest()
if (name.find("deinterlace") != std::string::npos)
{
- if (FilterOpen(name, true))
+ bool ret = FilterOpen(name, false, true);
+ FilterClose();
+ if (ret)
{
m_deintFilterName = name;
-
CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - found deinterlacing filter {}",
__FUNCTION__, name);
-
return;
}
}
@@ -684,14 +684,31 @@ void CDVDVideoCodecDRMPRIME::FilterTest()
__FUNCTION__);
}
-bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
+AVFrame *CDVDVideoCodecDRMPRIME::alloc_filter_frame(AVFilterContext * ctx, void * v, int w, int h)
+{
+ int result;
+ CDVDVideoCodecDRMPRIME* me = static_cast<CDVDVideoCodecDRMPRIME*>(v);
+ AVFrame *frame = av_frame_alloc();
+ frame->width = w;
+ frame->height = h;
+ frame->format = AV_PIX_FMT_YUV420P;
+
+ if ((result = CDVDVideoCodecDRMPRIME::GetBuffer(me->m_pCodecContext, frame, 0)) < 0)
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::alloc_filter_frame - failed to GetBuffer ({})", result);
+ return nullptr;
+ }
+ return frame;
+}
+
+bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale, bool test)
{
int result;
if (m_pFilterGraph)
FilterClose();
- if (filters.empty())
+ if (filters.empty() && !scale)
return true;
if (!(m_pFilterGraph = avfilter_graph_alloc()))
@@ -702,13 +719,13 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
const AVFilter* srcFilter = avfilter_get_by_name("buffer");
const AVFilter* outFilter = avfilter_get_by_name("buffersink");
- enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE };
+ enum AVPixelFormat pix_fmts[] = { scale ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE };
std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:"
"pixel_aspect={}/{}",
m_pCodecContext->width,
m_pCodecContext->height,
- AV_PIX_FMT_DRM_PRIME,
+ scale ? m_pCodecContext->pix_fmt : AV_PIX_FMT_DRM_PRIME,
m_pCodecContext->time_base.num ?
m_pCodecContext->time_base.num : 1,
m_pCodecContext->time_base.num ?
@@ -727,7 +744,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
CLog::Log(LOGERROR,
"CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: src: {} ({})",
err, result);
- FilterClose();
return false;
}
@@ -735,7 +751,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
if (!par)
{
CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - unable to alloc buffersrc");
- FilterClose();
return false;
}
@@ -751,7 +766,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
CLog::Log(LOGERROR,
"CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersrc_parameters_set: {} ({})",
err, result);
- FilterClose();
return false;
}
av_freep(&par);
@@ -765,7 +779,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
CLog::Log(LOGERROR,
"CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_create_filter: out: {} ({})",
err, result);
- FilterClose();
return false;
}
@@ -774,32 +787,46 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
if (result < 0)
{
CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - failed settings pix formats");
- FilterClose();
return false;
}
- AVFilterInOut* outputs = avfilter_inout_alloc();
- AVFilterInOut* inputs = avfilter_inout_alloc();
+ if (!filters.empty())
+ {
+ AVFilterInOut* outputs = avfilter_inout_alloc();
+ AVFilterInOut* inputs = avfilter_inout_alloc();
- outputs->name = av_strdup("in");
- outputs->filter_ctx = m_pFilterIn;
- outputs->pad_idx = 0;
- outputs->next = nullptr;
+ outputs->name = av_strdup("in");
+ outputs->filter_ctx = m_pFilterIn;
+ outputs->pad_idx = 0;
+ outputs->next = nullptr;
- inputs->name = av_strdup("out");
- inputs->filter_ctx = m_pFilterOut;
- inputs->pad_idx = 0;
- inputs->next = nullptr;
+ inputs->name = av_strdup("out");
+ inputs->filter_ctx = m_pFilterOut;
+ inputs->pad_idx = 0;
+ inputs->next = nullptr;
- result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL);
- avfilter_inout_free(&outputs);
- avfilter_inout_free(&inputs);
+ result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL);
+ avfilter_inout_free(&outputs);
+ avfilter_inout_free(&inputs);
- if (result < 0)
+ if (result < 0)
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse");
+ return false;
+ }
+ }
+ else
{
- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse");
- FilterClose();
- return false;
+ if ((result = av_buffersink_set_alloc_video_frame(m_pFilterOut, alloc_filter_frame, static_cast<void*>(this))) < 0)
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersink_set_alloc_video_frame = {}", result);
+ return result;
+ }
+ if ((result = avfilter_link(m_pFilterIn, 0, m_pFilterOut, 0)) < 0)
+ {
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_link");
+ return false;
+ }
}
if ((result = avfilter_graph_config(m_pFilterGraph, nullptr)) < 0)
@@ -808,15 +835,11 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool test)
av_strerror(result, err, AV_ERROR_MAX_STRING_SIZE);
CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_config: {} ({})",
err, result);
- FilterClose();
return false;
}
if (test)
- {
- FilterClose();
return true;
- }
m_processInfo.SetVideoDeintMethod(filters);
@@ -851,16 +874,16 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
{
// sw decoded buffers need cache flush and for descripter to be set
- if (!IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && m_pFrame->opaque != nullptr)
+ if (!IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)))
{
- CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(m_pFrame->opaque);
+ CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(av_buffer_get_opaque(m_pFrame->buf[0]));
buffer->SetDimensions(m_pFrame->width, m_pFrame->height);
buffer->SyncEnd();
auto descriptor = buffer->GetDescriptor();
m_pFrame->data[0] = reinterpret_cast<uint8_t*>(descriptor);
+ m_pFrame->format = AV_PIX_FMT_DRM_PRIME;
}
- m_pFrame->format = AV_PIX_FMT_DRM_PRIME;
int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame);
if (ret < 0)
{
@@ -953,25 +976,28 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
return VC_ERROR;
}
+ // we need to scale if the buffer isn't in DRM_PRIME format
+ bool need_scale = !IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && !IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format));
+
if (!m_processInfo.GetVideoInterlaced() && m_pFrame->interlaced_frame)
m_processInfo.SetVideoInterlaced(true);
std::string filterChain = GetFilterChain(m_pFrame->interlaced_frame);
- if (!filterChain.empty())
+ if (!filterChain.empty() || need_scale)
{
bool reopenFilter = false;
if (m_filters != filterChain)
reopenFilter = true;
if (m_pFilterGraph &&
- (m_pFilterIn->outputs[0]->w != m_pCodecContext->width ||
- m_pFilterIn->outputs[0]->h != m_pCodecContext->height))
+ (m_pFilterIn->outputs[0]->w != m_pFrame->width ||
+ m_pFilterIn->outputs[0]->h != m_pFrame->height))
reopenFilter = true;
- if (reopenFilter)
+ if (reopenFilter || (need_scale && m_pFilterGraph == nullptr))
{
m_filters = filterChain;
- if (!FilterOpen(filterChain, false))
+ if (!FilterOpen(filterChain, need_scale, false))
FilterClose();
}
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
index fab3431d40..bb88fde1f9 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
@@ -44,7 +44,8 @@ protected:
CDVDVideoCodec::VCReturn ProcessFilterOut();
static enum AVPixelFormat GetFormat(struct AVCodecContext* avctx, const enum AVPixelFormat* fmt);
static int GetBuffer(struct AVCodecContext* avctx, AVFrame* frame, int flags);
- bool FilterOpen(const std::string& filters, bool test);
+ static AVFrame *alloc_filter_frame(AVFilterContext * ctx, void * v, int w, int h);
+ bool FilterOpen(const std::string& filters, bool scale, bool test);
void FilterClose();
void FilterTest();
std::string GetFilterChain(bool interlaced);
--
2.34.1
From 9d7c4cd5305a52b7806029860b40d79348475cdf Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Fri, 14 Apr 2023 19:59:42 +0100
Subject: [PATCH 17/24] DVDVideoCodecDRMPRIME: Remove obsolete
thread_safe_callbacks
---
xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 62fc0cf822..3ed59af9f7 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -355,7 +355,6 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
m_pCodecContext->bits_per_coded_sample = hints.bitsperpixel;
m_pCodecContext->time_base.num = 1;
m_pCodecContext->time_base.den = DVD_TIME_BASE;
- m_pCodecContext->thread_safe_callbacks = 1;
m_pCodecContext->thread_count = CServiceBroker::GetCPUInfo()->GetCPUCount();
if (hints.extradata)
--
2.34.1
From 0c73ce0ada72ec08efbb4b77e7401d91e498f56d Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Mon, 15 May 2023 12:50:16 +0100
Subject: [PATCH 18/24] DVDVideoCodecDRMPRIME: Fix missing flush after eof
---
.../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 3ed59af9f7..c9ea5d52d5 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -965,7 +965,15 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
if (ret == AVERROR(EAGAIN))
return VC_BUFFER;
else if (ret == AVERROR_EOF)
+ {
+ if (m_codecControlFlags & DVD_CODEC_CTRL_DRAIN)
+ {
+ CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - flush buffers", __FUNCTION__);
+ avcodec_flush_buffers(m_pCodecContext);
+ SetCodecControl(m_codecControlFlags & ~DVD_CODEC_CTRL_DRAIN);
+ }
return VC_EOF;
+ }
else if (ret)
{
char err[AV_ERROR_MAX_STRING_SIZE] = {};
--
2.34.1
From 647cfba9f3f8feb7b4b9b31a7a235e8d7fbb066c Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Wed, 31 May 2023 19:40:37 +0100
Subject: [PATCH 19/24] DVDVideoCodecDRMPRIME: Clear m_pFilterGraph
---
xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index c9ea5d52d5..a3eecf0aed 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -867,6 +867,7 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
// Disposed by above code
m_pFilterIn = nullptr;
m_pFilterOut = nullptr;
+ m_pFilterGraph = nullptr;
}
}
--
2.34.1
From bf72fb426a6f1d6f2903d3f0fc825f3b69c2eea4 Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Fri, 2 Jun 2023 11:34:22 +0100
Subject: [PATCH 20/24] DVDVideoCodecDRMPRIME: Move FilterTest from open to
first frame returned
The pixel format is not accurate until the first frame is returned
and it may (later) influence the choice of deinterlacers available.
---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 24 ++++++++++++-------
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h | 1 +
2 files changed, 16 insertions(+), 9 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index a3eecf0aed..c2d1e496e0 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -387,15 +387,7 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
m_processInfo.SetVideoDAR(hints.aspect);
m_processInfo.SetVideoDeintMethod("none");
- FilterTest();
-
- if (!m_deintFilterName.empty())
- {
- std::list<EINTERLACEMETHOD> methods;
- methods.push_back(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
- m_processInfo.UpdateDeinterlacingMethods(methods);
- m_processInfo.SetDeinterlacingMethodDefault(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
- }
+ m_checkedDeinterlace = false;
return true;
}
@@ -984,6 +976,20 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
return VC_ERROR;
}
+ if (!m_checkedDeinterlace)
+ {
+ FilterTest();
+
+ if (!m_deintFilterName.empty())
+ {
+ std::list<EINTERLACEMETHOD> methods;
+ methods.push_back(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
+ m_processInfo.UpdateDeinterlacingMethods(methods);
+ m_processInfo.SetDeinterlacingMethodDefault(EINTERLACEMETHOD::VS_INTERLACEMETHOD_DEINTERLACE);
+ }
+ m_checkedDeinterlace = true;
+ }
+
// we need to scale if the buffer isn't in DRM_PRIME format
bool need_scale = !IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && !IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format));
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
index bb88fde1f9..df17f89b96 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
@@ -56,6 +56,7 @@ protected:
int m_codecControlFlags = 0;
CDVDStreamInfo m_hints;
double m_DAR = 1.0;
+ bool m_checkedDeinterlace = false;
AVCodecContext* m_pCodecContext = nullptr;
AVFrame* m_pFrame = nullptr;
AVFrame* m_pFilterFrame = nullptr;
--
2.34.1
From d1ca2d8b7bf6bcf3abe5dbffabb0d03f432925ed Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Wed, 31 May 2023 14:19:20 +0100
Subject: [PATCH 21/24] DVDVideoCodecDRMPRIME: Rework filtering code to handle
sw deinterlace
---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 134 +++++++++---------
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.h | 4 +-
2 files changed, 68 insertions(+), 70 deletions(-)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index c2d1e496e0..521a4c174b 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -207,11 +207,7 @@ static const AVCodec* FindDecoder(CDVDStreamInfo& hints)
return codec;
}
- codec = avcodec_find_decoder(hints.codec);
- if (codec && (codec->capabilities & AV_CODEC_CAP_DR1) == AV_CODEC_CAP_DR1)
- return codec;
-
- return nullptr;
+ return avcodec_find_decoder(hints.codec);
}
enum AVPixelFormat CDVDVideoCodecDRMPRIME::GetFormat(struct AVCodecContext* avctx,
@@ -646,27 +642,33 @@ bool CDVDVideoCodecDRMPRIME::SetPictureParams(VideoPicture* pVideoPicture)
return true;
}
-void CDVDVideoCodecDRMPRIME::FilterTest()
+void CDVDVideoCodecDRMPRIME::FilterTest(AVPixelFormat pix_fmt)
{
- const AVFilter* filter;
- void* opaque{};
-
m_deintFilterName.clear();
- while ((filter = av_filter_iterate(&opaque)) != nullptr)
+ // look twice, first for DRM_PRIME support, then for actual pixel format
+ for (int i=0; i < 2; i++)
{
- std::string name(filter->name);
+ const AVFilter* filter;
+ void* opaque{};
- if (name.find("deinterlace") != std::string::npos)
+ while ((filter = av_filter_iterate(&opaque)) != nullptr)
{
- bool ret = FilterOpen(name, false, true);
- FilterClose();
- if (ret)
+ std::string name(filter->name);
+
+ if (name.find(i == 0 ? "deinterlace" : "bwdif") != std::string::npos)
{
- m_deintFilterName = name;
- CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - found deinterlacing filter {}",
- __FUNCTION__, name);
- return;
+ bool ret = FilterOpen(name, pix_fmt, true);
+ FilterClose();
+ if (ret)
+ {
+ m_deintFilterName = name;
+ if (name == "bwdif" || name == "yadif")
+ m_deintFilterName += "=1:-1:1";
+ CLog::Log(LOGDEBUG, "CDVDVideoCodecDRMPRIME::{} - found deinterlacing filter {}",
+ __FUNCTION__, name);
+ return;
+ }
}
}
}
@@ -692,14 +694,17 @@ AVFrame *CDVDVideoCodecDRMPRIME::alloc_filter_frame(AVFilterContext * ctx, void
return frame;
}
-bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale, bool test)
+bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, AVPixelFormat pix_fmt, bool test)
{
int result;
+ if (filters.find("deinterlace") != std::string::npos && pix_fmt == AV_PIX_FMT_YUV420P)
+ pix_fmt = AV_PIX_FMT_DRM_PRIME;
+
if (m_pFilterGraph)
FilterClose();
- if (filters.empty() && !scale)
+ if (filters.empty())
return true;
if (!(m_pFilterGraph = avfilter_graph_alloc()))
@@ -710,13 +715,12 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale,
const AVFilter* srcFilter = avfilter_get_by_name("buffer");
const AVFilter* outFilter = avfilter_get_by_name("buffersink");
- enum AVPixelFormat pix_fmts[] = { scale ? AV_PIX_FMT_YUV420P : AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_NONE };
std::string args = StringUtils::Format("video_size={}x{}:pix_fmt={}:time_base={}/{}:"
"pixel_aspect={}/{}",
m_pCodecContext->width,
m_pCodecContext->height,
- scale ? m_pCodecContext->pix_fmt : AV_PIX_FMT_DRM_PRIME,
+ pix_fmt,
m_pCodecContext->time_base.num ?
m_pCodecContext->time_base.num : 1,
m_pCodecContext->time_base.num ?
@@ -773,6 +777,7 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale,
return false;
}
+ enum AVPixelFormat pix_fmts[] = { AV_PIX_FMT_DRM_PRIME, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE };
result = av_opt_set_int_list(m_pFilterOut, "pix_fmts", &pix_fmts[0],
AV_PIX_FMT_NONE, AV_OPT_SEARCH_CHILDREN);
if (result < 0)
@@ -781,43 +786,32 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale,
return false;
}
- if (!filters.empty())
+ if ((result = av_buffersink_set_alloc_video_frame(m_pFilterOut, alloc_filter_frame, static_cast<void*>(this))) < 0)
{
- AVFilterInOut* outputs = avfilter_inout_alloc();
- AVFilterInOut* inputs = avfilter_inout_alloc();
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersink_set_alloc_video_frame = {}", result);
+ return result;
+ }
+ AVFilterInOut* outputs = avfilter_inout_alloc();
+ AVFilterInOut* inputs = avfilter_inout_alloc();
- outputs->name = av_strdup("in");
- outputs->filter_ctx = m_pFilterIn;
- outputs->pad_idx = 0;
- outputs->next = nullptr;
+ outputs->name = av_strdup("in");
+ outputs->filter_ctx = m_pFilterIn;
+ outputs->pad_idx = 0;
+ outputs->next = nullptr;
- inputs->name = av_strdup("out");
- inputs->filter_ctx = m_pFilterOut;
- inputs->pad_idx = 0;
- inputs->next = nullptr;
+ inputs->name = av_strdup("out");
+ inputs->filter_ctx = m_pFilterOut;
+ inputs->pad_idx = 0;
+ inputs->next = nullptr;
- result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL);
- avfilter_inout_free(&outputs);
- avfilter_inout_free(&inputs);
+ result = avfilter_graph_parse_ptr(m_pFilterGraph, filters.c_str(), &inputs, &outputs, NULL);
+ avfilter_inout_free(&outputs);
+ avfilter_inout_free(&inputs);
- if (result < 0)
- {
- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse");
- return false;
- }
- }
- else
+ if (result < 0)
{
- if ((result = av_buffersink_set_alloc_video_frame(m_pFilterOut, alloc_filter_frame, static_cast<void*>(this))) < 0)
- {
- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - av_buffersink_set_alloc_video_frame = {}", result);
- return result;
- }
- if ((result = avfilter_link(m_pFilterIn, 0, m_pFilterOut, 0)) < 0)
- {
- CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_link");
- return false;
- }
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::FilterOpen - avfilter_graph_parse");
+ return false;
}
if ((result = avfilter_graph_config(m_pFilterGraph, nullptr)) < 0)
@@ -832,8 +826,6 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, bool scale,
if (test)
return true;
- m_processInfo.SetVideoDeintMethod(filters);
-
if (CServiceBroker::GetLogging().CanLogComponent(LOGVIDEO))
{
char* graphDump = avfilter_graph_dump(m_pFilterGraph, nullptr);
@@ -865,8 +857,8 @@ void CDVDVideoCodecDRMPRIME::FilterClose()
CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
{
- // sw decoded buffers need cache flush and for descripter to be set
- if (!IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)))
+ // sw decoded buffers submitted to hw decoder need cache flush and for descriptor to be set
+ if (m_pFrame->format != AV_PIX_FMT_DRM_PRIME && m_pFilterGraph && m_pFilterIn->outputs[0]->format == AV_PIX_FMT_DRM_PRIME)
{
CVideoBufferDMA* buffer = static_cast<CVideoBufferDMA*>(av_buffer_get_opaque(m_pFrame->buf[0]));
buffer->SetDimensions(m_pFrame->width, m_pFrame->height);
@@ -976,9 +968,10 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
return VC_ERROR;
}
+ AVPixelFormat pix_fmt = static_cast<AVPixelFormat>(m_pFrame->format);
if (!m_checkedDeinterlace)
{
- FilterTest();
+ FilterTest(pix_fmt);
if (!m_deintFilterName.empty())
{
@@ -990,28 +983,33 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::GetPicture(VideoPicture* pVideo
m_checkedDeinterlace = true;
}
- // we need to scale if the buffer isn't in DRM_PRIME format
- bool need_scale = !IsSupportedSwFormat(static_cast<AVPixelFormat>(m_pFrame->format)) && !IsSupportedHwFormat(static_cast<AVPixelFormat>(m_pFrame->format));
-
if (!m_processInfo.GetVideoInterlaced() && m_pFrame->interlaced_frame)
m_processInfo.SetVideoInterlaced(true);
std::string filterChain = GetFilterChain(m_pFrame->interlaced_frame);
- if (!filterChain.empty() || need_scale)
+
+ // we need to scale if the buffer isn't in DRM_PRIME format
+ if (!IsSupportedSwFormat(pix_fmt) && !IsSupportedHwFormat(pix_fmt))
+ filterChain = "scale";
+ // we need to copy if the buffer wasn't allocated by us
+ else if (!IsSupportedHwFormat(pix_fmt) && !(m_pCodecContext->codec->capabilities & AV_CODEC_CAP_DR1))
+ filterChain = "copy";
+
+ if (!filterChain.empty())
{
- bool reopenFilter = false;
- if (m_filters != filterChain)
- reopenFilter = true;
+ bool reopenFilter = m_filters != filterChain;
if (m_pFilterGraph &&
(m_pFilterIn->outputs[0]->w != m_pFrame->width ||
m_pFilterIn->outputs[0]->h != m_pFrame->height))
reopenFilter = true;
- if (reopenFilter || (need_scale && m_pFilterGraph == nullptr))
+ if (reopenFilter)
{
m_filters = filterChain;
- if (!FilterOpen(filterChain, need_scale, false))
+ m_processInfo.SetVideoDeintMethod(m_filters);
+
+ if (!FilterOpen(filterChain, pix_fmt, false))
FilterClose();
}
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
index df17f89b96..55675c3c2e 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.h
@@ -45,9 +45,9 @@ protected:
static enum AVPixelFormat GetFormat(struct AVCodecContext* avctx, const enum AVPixelFormat* fmt);
static int GetBuffer(struct AVCodecContext* avctx, AVFrame* frame, int flags);
static AVFrame *alloc_filter_frame(AVFilterContext * ctx, void * v, int w, int h);
- bool FilterOpen(const std::string& filters, bool scale, bool test);
+ bool FilterOpen(const std::string& filters, AVPixelFormat pix_fmt, bool test);
void FilterClose();
- void FilterTest();
+ void FilterTest(AVPixelFormat pix_fmt);
std::string GetFilterChain(bool interlaced);
std::string m_name;
--
2.34.1
From 5c5a019635595b296c0f52783e59a8fb85ee9694 Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Tue, 20 Jun 2023 15:13:09 +0100
Subject: [PATCH 22/24] CDVDVideoCodecDRMPRIME: Support decoding to DRMPRIME
with sw deinterlace
We can map a YUV style DRM_PRIME buffer back to AV_PIX_FMT_YUV420P
to allow subsequent sw deinterlace
---
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 22 +++++++++++++++++++
1 file changed, 22 insertions(+)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 521a4c174b..326d33e8a0 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -701,6 +701,9 @@ bool CDVDVideoCodecDRMPRIME::FilterOpen(const std::string& filters, AVPixelForma
if (filters.find("deinterlace") != std::string::npos && pix_fmt == AV_PIX_FMT_YUV420P)
pix_fmt = AV_PIX_FMT_DRM_PRIME;
+ if (filters.find("bwdif") != std::string::npos && pix_fmt == AV_PIX_FMT_DRM_PRIME)
+ pix_fmt = AV_PIX_FMT_YUV420P;
+
if (m_pFilterGraph)
FilterClose();
@@ -867,6 +870,25 @@ CDVDVideoCodec::VCReturn CDVDVideoCodecDRMPRIME::ProcessFilterIn()
m_pFrame->data[0] = reinterpret_cast<uint8_t*>(descriptor);
m_pFrame->format = AV_PIX_FMT_DRM_PRIME;
}
+ // hw decoded buffers submitted to sw decoder need mapping of planes for cpu to access
+ else if (m_pFrame->format == AV_PIX_FMT_DRM_PRIME && m_pFilterGraph && m_pFilterIn->outputs[0]->format == AV_PIX_FMT_YUV420P)
+ {
+ AVFrame *frame = av_frame_alloc();
+ frame->width = m_pFrame->width;
+ frame->height = m_pFrame->height;
+ frame->format = AV_PIX_FMT_YUV420P;
+ int ret = av_hwframe_map(frame, m_pFrame, (int)AV_HWFRAME_MAP_READ);
+ if (ret < 0)
+ {
+ char err[AV_ERROR_MAX_STRING_SIZE] = {};
+ av_strerror(ret, err, AV_ERROR_MAX_STRING_SIZE);
+ CLog::Log(LOGERROR, "CDVDVideoCodecDRMPRIME::{} - av_hwframe_map failed: {} ({})",
+ __FUNCTION__, err, ret);
+ return VC_ERROR;
+ }
+ av_frame_unref(m_pFrame);
+ av_frame_move_ref(m_pFrame, frame);
+ }
int ret = av_buffersrc_add_frame(m_pFilterIn, m_pFrame);
if (ret < 0)
--
2.34.1
From 56510bc7aac6c7b4a88bb952fbfabbbbe30df455 Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Tue, 20 Jun 2023 15:14:02 +0100
Subject: [PATCH 23/24] DVDVideoCodecDRMPRIME: Request v4l2 buffers be
allocated through cache
This is an optional request, but will improve performance of sw deinterlace
if supported.
---
.../VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 326d33e8a0..8c0d37bf59 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -367,6 +367,10 @@ bool CDVDVideoCodecDRMPRIME::Open(CDVDStreamInfo& hints, CDVDCodecOptions& optio
for (auto&& option : options.m_keys)
av_opt_set(m_pCodecContext, option.m_name.c_str(), option.m_value.c_str(), 0);
+ // this requests v4l2 buffers are allocated through cache. It will work if this is not supported,
+ // but subsequent operations like deinterlace may be less efficient
+ av_opt_set(m_pCodecContext->priv_data, "dmabuf_alloc", "cma", 0);
+
if (avcodec_open2(m_pCodecContext, pCodec, nullptr) < 0)
{
CLog::Log(LOGINFO, "CDVDVideoCodecDRMPRIME::{} - unable to open codec", __FUNCTION__);
--
2.34.1
From a77c833ea78ab14259b298e60b86205a98099f87 Mon Sep 17 00:00:00 2001
From: Dom Cobley <popcornmix@gmail.com>
Date: Wed, 21 Jun 2023 13:16:01 +0100
Subject: [PATCH 24/24] DVDVideoCodecDRMPRIME: Add setting to enable hw
deinterlace
HW deinterlace has lower cpu, but may have higher quality,
so allow user to choose appropriate setting.
---
.../resource.language.en_gb/resources/strings.po | 11 +++++++++++
system/settings/linux.xml | 12 ++++++++++++
.../DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp | 16 +++++++++++++++-
xbmc/settings/Settings.h | 1 +
4 files changed, 39 insertions(+), 1 deletion(-)
diff --git a/addons/resource.language.en_gb/resources/strings.po b/addons/resource.language.en_gb/resources/strings.po
index 9fb3dcb2b0..c2d0f4352a 100644
--- a/addons/resource.language.en_gb/resources/strings.po
+++ b/addons/resource.language.en_gb/resources/strings.po
@@ -7363,6 +7363,11 @@ msgctxt "#13438"
msgid "Allow hardware acceleration with DRM PRIME"
msgstr ""
+#: system/settings/settings.xml
+msgctxt "#13500"
+msgid "Allow hardware deinterlace with DRM PRIME"
+msgstr ""
+
#: system/settings/settings.xml
msgctxt "#13439"
msgid "Allow hardware acceleration - MediaCodec"
@@ -19550,6 +19555,12 @@ msgctxt "#36172"
msgid "Enable PRIME decoding of video files"
msgstr ""
+#. Description of setting with label #13500 "Allow hardware deinterlace - PRIME"
+#: system/settings/settings.xml
+msgctxt "#36290"
+msgid "Enable PRIME hardware deinterlace of video files"
+msgstr ""
+
#. Description of setting with label #14109 "Short date format"
#: system/settings/settings.xml
msgctxt "#36173"
diff --git a/system/settings/linux.xml b/system/settings/linux.xml
index 89b91db23b..4cdb0982af 100644
--- a/system/settings/linux.xml
+++ b/system/settings/linux.xml
@@ -180,6 +180,18 @@
<default>true</default>
<control type="toggle" />
</setting>
+ <setting id="videoplayer.primeallowhwdeinterlace" type="boolean" parent="videoplayer.useprimedecoder" label="13500" help="36290">
+ <requirement>HAS_GLES</requirement>
+ <visible>false</visible>
+ <dependencies>
+ <dependency type="enable">
+ <condition setting="videoplayer.useprimedecoder" operator="is">true</condition>
+ </dependency>
+ </dependencies>
+ <level>3</level>
+ <default>true</default>
+ <control type="toggle" />
+ </setting>
<setting id="videoplayer.useprimerenderer" type="integer" label="13462" help="13463">
<requirement>HAS_GLES</requirement>
<visible>false</visible>
diff --git a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
index 8c0d37bf59..141f08d4fb 100644
--- a/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
+++ b/xbmc/cores/VideoPlayer/DVDCodecs/Video/DVDVideoCodecDRMPRIME.cpp
@@ -41,6 +41,7 @@ namespace
{
constexpr const char* SETTING_VIDEOPLAYER_USEPRIMEDECODERFORHW{"videoplayer.useprimedecoderforhw"};
+constexpr const char* SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE{"videoplayer.primeallowhwdeinterlace"};
static void ReleaseBuffer(void* opaque, uint8_t* data)
{
@@ -149,6 +150,15 @@ void CDVDVideoCodecDRMPRIME::Register()
setting->SetVisible(true);
+ setting = settings->GetSetting(SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE);
+ if (!setting)
+ {
+ CLog::Log(LOGERROR, "Failed to load setting for: {}", SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE);
+ return;
+ }
+
+ setting->SetVisible(true);
+
CDVDFactoryCodec::RegisterHWVideoCodec("drm_prime", CDVDVideoCodecDRMPRIME::Create);
}
@@ -651,7 +661,11 @@ void CDVDVideoCodecDRMPRIME::FilterTest(AVPixelFormat pix_fmt)
m_deintFilterName.clear();
// look twice, first for DRM_PRIME support, then for actual pixel format
- for (int i=0; i < 2; i++)
+
+ bool hw = CServiceBroker::GetSettingsComponent()->GetSettings()->GetBool(
+ SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE);
+
+ for (int i = hw ? 0 : 1; i < 2; i++)
{
const AVFilter* filter;
void* opaque{};
diff --git a/xbmc/settings/Settings.h b/xbmc/settings/Settings.h
index bfc5e6072c..5e7ea6ff9e 100644
--- a/xbmc/settings/Settings.h
+++ b/xbmc/settings/Settings.h
@@ -123,6 +123,7 @@ public:
static constexpr auto SETTING_VIDEOPLAYER_USEMEDIACODEC = "videoplayer.usemediacodec";
static constexpr auto SETTING_VIDEOPLAYER_USEMEDIACODECSURFACE =
"videoplayer.usemediacodecsurface";
+ static constexpr auto SETTING_VIDEOPLAYER_ALLOWHWDEINTERLACE = "videoplayer.primeallowhwdeinterlace";
static constexpr auto SETTING_VIDEOPLAYER_USEVDPAU = "videoplayer.usevdpau";
static constexpr auto SETTING_VIDEOPLAYER_USEVDPAUMIXER = "videoplayer.usevdpaumixer";
static constexpr auto SETTING_VIDEOPLAYER_USEVDPAUMPEG2 = "videoplayer.usevdpaumpeg2";
--
2.34.1