Imported Upstream version 0.18.0

author: James Cowgill <james410@cowgill.org.uk> 2016-07-04 11:19:11 +0200
committer: James Cowgill <james410@cowgill.org.uk> 2016-07-04 11:19:11 +0200
commit: b3df5144ae0631b8634e535ba90245e8cdfd2a0a (patch)
tree: bc955df92f24b7140d3e0d4ec56edcfa74b32c5b /video
parent: 36e11d485bf132c7ae9cf5c3433ae40d63adb54d (diff)
112 files changed, 6336 insertions, 3482 deletions
diff --git a/video/csputils.c b/video/csputils.c
index 69d3b80..ffa1f82 100644
--- a/video/csputils.c
+++ b/video/csputils.c
@@ -77,6 +77,7 @@ const struct m_opt_choice_alternatives mp_csp_trc_names[] = {
     {"gamma2.2",    MP_CSP_TRC_GAMMA22},
     {"gamma2.8",    MP_CSP_TRC_GAMMA28},
     {"prophoto",    MP_CSP_TRC_PRO_PHOTO},
+    {"st2084",      MP_CSP_TRC_SMPTE_ST2084},
     {0}
 };
 
@@ -170,6 +171,9 @@ enum mp_csp_trc avcol_trc_to_mp_csp_trc(int avtrc)
     case AVCOL_TRC_LINEAR:       return MP_CSP_TRC_LINEAR;
     case AVCOL_TRC_GAMMA22:      return MP_CSP_TRC_GAMMA22;
     case AVCOL_TRC_GAMMA28:      return MP_CSP_TRC_GAMMA28;
+#if HAVE_AVUTIL_ST2084
+    case AVCOL_TRC_SMPTEST2084:  return MP_CSP_TRC_SMPTE_ST2084;
+#endif
     default:                     return MP_CSP_TRC_AUTO;
     }
 }
@@ -213,12 +217,15 @@ int mp_csp_trc_to_avcol_trc(enum mp_csp_trc trc)
 {
     switch (trc) {
     // We just call it BT.1886 since we're decoding, but it's still BT.709
-    case MP_CSP_TRC_BT_1886:     return AVCOL_TRC_BT709;
-    case MP_CSP_TRC_SRGB:        return AVCOL_TRC_IEC61966_2_1;
-    case MP_CSP_TRC_LINEAR:      return AVCOL_TRC_LINEAR;
-    case MP_CSP_TRC_GAMMA22:     return AVCOL_TRC_GAMMA22;
-    case MP_CSP_TRC_GAMMA28:     return AVCOL_TRC_GAMMA28;
-    default:                     return AVCOL_TRC_UNSPECIFIED;
+    case MP_CSP_TRC_BT_1886:      return AVCOL_TRC_BT709;
+    case MP_CSP_TRC_SRGB:         return AVCOL_TRC_IEC61966_2_1;
+    case MP_CSP_TRC_LINEAR:       return AVCOL_TRC_LINEAR;
+    case MP_CSP_TRC_GAMMA22:      return AVCOL_TRC_GAMMA22;
+    case MP_CSP_TRC_GAMMA28:      return AVCOL_TRC_GAMMA28;
+#if HAVE_AVUTIL_ST2084
+    case MP_CSP_TRC_SMPTE_ST2084: return AVCOL_TRC_SMPTEST2084;
+#endif
+    default:                      return AVCOL_TRC_UNSPECIFIED;
     }
 }
 
diff --git a/video/csputils.h b/video/csputils.h
index 1d8d3b1..19dd88f 100644
--- a/video/csputils.h
+++ b/video/csputils.h
@@ -78,6 +78,7 @@ enum mp_csp_trc {
     MP_CSP_TRC_GAMMA22,
     MP_CSP_TRC_GAMMA28,
     MP_CSP_TRC_PRO_PHOTO,
+    MP_CSP_TRC_SMPTE_ST2084,
     MP_CSP_TRC_COUNT
 };
 
diff --git a/video/d3d.h b/video/d3d.h
deleted file mode 100644
index 30bee49..0000000
--- a/video/d3d.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef MP_D3D_H_
-#define MP_D3D_H_
-
-#include <d3d9.h>
-
-#include "hwdec.h"
-
-struct mp_d3d_ctx {
-    struct mp_hwdec_ctx hwctx;
-    IDirect3DDevice9 *d3d9_device;
-};
-
-#endif
diff --git a/video/d3d11va.c b/video/d3d11va.c
deleted file mode 100644
index a9be571..0000000
--- a/video/d3d11va.c
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "mp_image.h"
-#include "d3d11va.h"
-
-struct d3d11va_surface {
-    HMODULE d3d11_dll;
-    ID3D11Texture2D              *texture;
-    ID3D11VideoDecoderOutputView *surface;
-};
-
-ID3D11VideoDecoderOutputView *d3d11_surface_in_mp_image(struct mp_image *mpi)
-{
-    return mpi && mpi->imgfmt == IMGFMT_D3D11VA ?
-        (ID3D11VideoDecoderOutputView *)mpi->planes[3] : NULL;
-}
-
-ID3D11Texture2D *d3d11_texture_in_mp_image(struct mp_image *mpi)
-{
-    if (!mpi || mpi->imgfmt != IMGFMT_D3D11VA)
-        return NULL;
-    struct d3d11va_surface *surface = (void *)mpi->planes[0];
-    return surface->texture;
-}
-
-static void d3d11va_release_img(void *arg)
-{
-    struct d3d11va_surface *surface = arg;
-    if (surface->surface)
-        ID3D11VideoDecoderOutputView_Release(surface->surface);
-
-    if (surface->texture)
-        ID3D11Texture2D_Release(surface->texture);
-
-    if (surface->d3d11_dll)
-        FreeLibrary(surface->d3d11_dll);
-
-    talloc_free(surface);
-}
-
-struct mp_image *d3d11va_new_ref(ID3D11VideoDecoderOutputView *view,
-                                 int w, int h)
-{
-    if (!view)
-        return NULL;
-    struct d3d11va_surface *surface = talloc_zero(NULL, struct d3d11va_surface);
-
-    surface->d3d11_dll = LoadLibrary(L"d3d11.dll");
-    if (!surface->d3d11_dll)
-        goto fail;
-
-    surface->surface = view;
-    ID3D11VideoDecoderOutputView_AddRef(surface->surface);
-    ID3D11VideoDecoderOutputView_GetResource(
-        surface->surface, (ID3D11Resource **)&surface->texture);
-
-    struct mp_image *mpi = mp_image_new_custom_ref(
-        &(struct mp_image){0}, surface, d3d11va_release_img);
-    if (!mpi)
-        abort();
-
-    mp_image_setfmt(mpi, IMGFMT_D3D11VA);
-    mp_image_set_size(mpi, w, h);
-    mpi->planes[0] = (void *)surface;
-    mpi->planes[3] = (void *)surface->surface;
-
-    return mpi;
-fail:
-    d3d11va_release_img(surface);
-    return NULL;
-}
diff --git a/video/d3d11va.h b/video/d3d11va.h
deleted file mode 100644
index db2f295..0000000
--- a/video/d3d11va.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef MPV_D3D11_H
-#define MPV_D3D11_H
-
-#include <d3d11.h>
-
-struct mp_image;
-
-ID3D11VideoDecoderOutputView *d3d11_surface_in_mp_image(struct mp_image *mpi);
-ID3D11Texture2D              *d3d11_texture_in_mp_image(struct mp_image *mpi);
-struct mp_image *d3d11va_new_ref(ID3D11VideoDecoderOutputView *view,
-                                 int w, int h);
-
-#endif
diff --git a/video/decode/d3d.c b/video/decode/d3d.c
index 35d1af9..b978472 100644
--- a/video/decode/d3d.c
+++ b/video/decode/d3d.c
@@ -15,6 +15,8 @@
  * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <pthread.h>
+
 #include <libavcodec/avcodec.h>
 
 #include "lavc.h"
@@ -48,7 +50,6 @@ DEFINE_GUID(DXVA2_ModeVP9_VLD_Profile0,         0x463707f8, 0xa1d0, 0x4585, 0x87
 
 DEFINE_GUID(DXVA2_NoEncrypt,                    0x1b81beD0, 0xa0c7, 0x11d3, 0xb9, 0x84, 0x00, 0xc0, 0x4f, 0x2e, 0x73, 0xc5);
 
-static const int PROF_MPEG2_SIMPLE[] = {FF_PROFILE_MPEG2_SIMPLE, 0};
 static const int PROF_MPEG2_MAIN[]   = {FF_PROFILE_MPEG2_SIMPLE,
                                         FF_PROFILE_MPEG2_MAIN, 0};
 static const int PROF_H264_HIGH[]    = {FF_PROFILE_H264_CONSTRAINED_BASELINE,
@@ -70,14 +71,14 @@ struct d3dva_mode {
 // Prefered modes must come first
 static const struct d3dva_mode d3dva_modes[] = {
     // MPEG-1/2
-    {MODE2(MPEG2_VLD),        AV_CODEC_ID_MPEG2VIDEO, PROF_MPEG2_SIMPLE},
+    {MODE2(MPEG2_VLD),        AV_CODEC_ID_MPEG2VIDEO, PROF_MPEG2_MAIN},
     {MODE2(MPEG2and1_VLD),    AV_CODEC_ID_MPEG2VIDEO, PROF_MPEG2_MAIN},
     {MODE2(MPEG2and1_VLD),    AV_CODEC_ID_MPEG1VIDEO},
 
     // H.264
     {MODE2(H264_F),                        AV_CODEC_ID_H264, PROF_H264_HIGH},
-    {MODE (Intel_H264_NoFGT_ClearVideo),   AV_CODEC_ID_H264, PROF_H264_HIGH},
     {MODE2(H264_E),                        AV_CODEC_ID_H264, PROF_H264_HIGH},
+    {MODE (Intel_H264_NoFGT_ClearVideo),   AV_CODEC_ID_H264, PROF_H264_HIGH},
     {MODE (ModeH264_VLD_WithFMOASO_NoFGT), AV_CODEC_ID_H264, PROF_H264_HIGH},
     {MODE (ModeH264_VLD_NoFGT_Flash),      AV_CODEC_ID_H264, PROF_H264_HIGH},
 
@@ -97,6 +98,22 @@ static const struct d3dva_mode d3dva_modes[] = {
 #undef MODE
 #undef MODE2
 
+HMODULE d3d11_dll, d3d9_dll, dxva2_dll;
+
+static pthread_once_t d3d_load_once = PTHREAD_ONCE_INIT;
+
+static void d3d_do_load(void)
+{
+    d3d11_dll = LoadLibrary(L"d3d11.dll");
+    d3d9_dll  = LoadLibrary(L"d3d9.dll");
+    dxva2_dll = LoadLibrary(L"dxva2.dll");
+}
+
+void d3d_load_dlls(void)
+{
+    pthread_once(&d3d_load_once, d3d_do_load);
+}
+
 int d3d_probe_codec(const char *codec)
 {
     enum AVCodecID codecid = mp_codec_to_av_codec_id(codec);
@@ -132,12 +149,13 @@ static bool mode_supported(const struct d3dva_mode *mode,
 
 struct d3d_decoder_fmt d3d_select_decoder_mode(
     struct lavc_ctx *s, const GUID *device_guids, UINT n_guids,
-    DWORD (*get_dxfmt_cb)(struct lavc_ctx *s, const GUID *guid, int depth))
+    const struct d3d_decoded_format *formats, int n_formats,
+    bool (*test_fmt_cb)(struct lavc_ctx *s, const GUID *guid,
+                        const struct d3d_decoded_format *fmt))
 {
     struct d3d_decoder_fmt fmt = {
-        .guid          = &GUID_NULL,
-        .mpfmt_decoded = IMGFMT_NONE,
-        .dxfmt_decoded = 0,
+        .guid   = &GUID_NULL,
+        .format = NULL,
     };
 
     // this has the right bit-depth, but is unfortunately not the native format
@@ -146,8 +164,6 @@ struct d3d_decoder_fmt d3d_select_decoder_mode(
         return fmt;
 
     int depth = IMGFMT_RGB_DEPTH(sw_img_fmt);
-    int p010  = mp_imgfmt_find(1, 1, 2, 10, MP_IMGFLAG_YUV_NV);
-    int mpfmt_decoded = depth <= 8 ? IMGFMT_NV12 : p010;
 
     for (int i = 0; i < MP_ARRAY_SIZE(d3dva_modes); i++) {
         const struct d3dva_mode *mode = &d3dva_modes[i];
@@ -155,12 +171,23 @@ struct d3d_decoder_fmt d3d_select_decoder_mode(
             profile_compatible(mode, s->avctx->profile) &&
             mode_supported(mode, device_guids, n_guids)) {
 
-            DWORD dxfmt_decoded = get_dxfmt_cb(s, mode->guid, depth);
-            if (dxfmt_decoded) {
-                fmt.guid          = mode->guid;
-                fmt.mpfmt_decoded = mpfmt_decoded;
-                fmt.dxfmt_decoded = dxfmt_decoded;
-                return fmt;
+            for (int n = 0; n < n_formats; n++) {
+                const struct d3d_decoded_format *format = &formats[n];
+
+                if (depth <= format->depth && test_fmt_cb(s, mode->guid, format))
+                {
+                    MP_VERBOSE(s, "Selecting %s ",
+                               d3d_decoder_guid_to_desc(mode->guid));
+                    if (format->dxfmt >= (1 << 16)) {
+                        MP_VERBOSE(s, "%s\n", mp_tag_str(format->dxfmt));
+                    } else {
+                        MP_VERBOSE(s, "%d\n", (int)format->dxfmt);
+                    }
+
+                    fmt.guid   = mode->guid;
+                    fmt.format = format;
+                    return fmt;
+                }
             }
         }
     }
diff --git a/video/decode/d3d.h b/video/decode/d3d.h
index bbd6bdf..15c423a 100644
--- a/video/decode/d3d.h
+++ b/video/decode/d3d.h
@@ -24,16 +24,31 @@
 struct mp_image;
 struct lavc_ctx;
 
+struct d3d_decoded_format {
+    DWORD       dxfmt;  // D3DFORMAT or DXGI_FORMAT
+    const char *name;   // informational string repr. of dxfmt
+    int         depth;  // significant bits (not full size)
+    int         mpfmt;  // IMGFMT_ with compatible memory layout and semantics
+};
+
 struct d3d_decoder_fmt {
     const GUID *guid;
-    int   mpfmt_decoded;
-    DWORD dxfmt_decoded; // D3DFORMAT or DXGI_FORMAT
+    const struct d3d_decoded_format *format;
 };
 
+// Must call d3d_load_dlls() before accessing. Once this is done, the DLLs
+// remain loaded forever.
+extern HMODULE d3d11_dll, d3d9_dll, dxva2_dll;
+
+void d3d_load_dlls(void);
+
 int d3d_probe_codec(const char *codec);
+
 struct d3d_decoder_fmt d3d_select_decoder_mode(
     struct lavc_ctx *s, const GUID *device_guids, UINT n_guids,
-    DWORD (*get_dxfmt_cb)(struct lavc_ctx *s, const GUID *guid, int depth));
+    const struct d3d_decoded_format *formats, int n_formats,
+    bool (*test_fmt_cb)(struct lavc_ctx *s, const GUID *guid,
+                        const struct d3d_decoded_format *fmt));
 
 char *d3d_decoder_guid_to_desc_buf(char *buf, size_t buf_size,
                                    const GUID *mode_guid);
diff --git a/video/decode/d3d11va.c b/video/decode/d3d11va.c
index 622a289..d929e1e 100644
--- a/video/decode/d3d11va.c
+++ b/video/decode/d3d11va.c
@@ -15,6 +15,7 @@
  * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <initguid.h>
 #include <libavcodec/d3d11va.h>
 
 #include "lavc.h"
@@ -25,7 +26,6 @@
 #include "video/mp_image_pool.h"
 #include "video/hwdec.h"
 
-#include "video/d3d11va.h"
 #include "d3d.h"
 
 #define ADDITIONAL_SURFACES (4 + HWDEC_DELAY_QUEUE_COUNT)
@@ -40,7 +40,6 @@ struct d3d11va_decoder {
 struct priv {
     struct mp_log *log;
 
-    HMODULE                 d3d11_dll;
     ID3D11Device           *device;
     ID3D11DeviceContext    *device_ctx;
     ID3D11VideoDevice      *video_dev;
@@ -50,6 +49,53 @@ struct priv {
     struct mp_image_pool   *sw_pool;
 };
 
+struct d3d11va_surface {
+    ID3D11Texture2D              *texture;
+    ID3D11VideoDecoderOutputView *surface;
+};
+
+static void d3d11va_release_img(void *arg)
+{
+    struct d3d11va_surface *surface = arg;
+    if (surface->surface)
+        ID3D11VideoDecoderOutputView_Release(surface->surface);
+
+    if (surface->texture)
+        ID3D11Texture2D_Release(surface->texture);
+
+    talloc_free(surface);
+}
+
+static struct mp_image *d3d11va_new_ref(ID3D11VideoDecoderOutputView *view,
+                                        int w, int h)
+{
+    if (!view)
+        return NULL;
+    struct d3d11va_surface *surface = talloc_zero(NULL, struct d3d11va_surface);
+
+    surface->surface = view;
+    ID3D11VideoDecoderOutputView_AddRef(surface->surface);
+    ID3D11VideoDecoderOutputView_GetResource(
+        surface->surface, (ID3D11Resource **)&surface->texture);
+
+    D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC surface_desc;
+    ID3D11VideoDecoderOutputView_GetDesc(surface->surface, &surface_desc);
+
+    struct mp_image *mpi =
+        mp_image_new_custom_ref(NULL, surface, d3d11va_release_img);
+    if (!mpi)
+        abort();
+
+    mp_image_setfmt(mpi, IMGFMT_D3D11VA);
+    mp_image_set_size(mpi, w, h);
+    mpi->planes[0] = NULL;
+    mpi->planes[1] = (void *)surface->texture;
+    mpi->planes[2] = (void *)(intptr_t)surface_desc.Texture2D.ArraySlice;
+    mpi->planes[3] = (void *)surface->surface;
+
+    return mpi;
+}
+
 static struct mp_image *d3d11va_allocate_image(struct lavc_ctx *s, int w, int h)
 {
     struct priv *p = s->hwdec_priv;
@@ -66,10 +112,14 @@ static struct mp_image *d3d11va_retrieve_image(struct lavc_ctx *s,
     HRESULT hr;
     struct priv *p = s->hwdec_priv;
     ID3D11Texture2D              *staging = p->decoder->staging;
-    ID3D11Texture2D              *texture = d3d11_texture_in_mp_image(img);
-    ID3D11VideoDecoderOutputView *surface = d3d11_surface_in_mp_image(img);
 
-    if (!texture || !surface) {
+    if (img->imgfmt != IMGFMT_D3D11VA)
+        return img;
+
+    ID3D11Texture2D *texture = (void *)img->planes[1];
+    int subindex = (intptr_t)img->planes[2];
+
+    if (!texture) {
         MP_ERR(p, "Failed to get Direct3D texture and surface from mp_image\n");
         return img;
     }
@@ -82,12 +132,10 @@ static struct mp_image *d3d11va_retrieve_image(struct lavc_ctx *s,
     }
 
     // copy to the staging texture
-    D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC surface_desc;
-    ID3D11VideoDecoderOutputView_GetDesc(surface, &surface_desc);
     ID3D11DeviceContext_CopySubresourceRegion(
         p->device_ctx,
         (ID3D11Resource *)staging, 0, 0, 0, 0,
-        (ID3D11Resource *)texture, surface_desc.Texture2D.ArraySlice, NULL);
+        (ID3D11Resource *)texture, subindex, NULL);
 
     struct mp_image *sw_img = mp_image_pool_get(p->sw_pool,
                                                 p->decoder->mpfmt_decoded,
@@ -117,27 +165,47 @@ static struct mp_image *d3d11va_retrieve_image(struct lavc_ctx *s,
     return sw_img;
 }
 
-struct d3d11_format {
-    DXGI_FORMAT format;
-    const char *name;
-    int         depth;
-};
-
 #define DFMT(name) MP_CONCAT(DXGI_FORMAT_, name), # name
-static const struct d3d11_format d3d11_formats[] = {
-    {DFMT(NV12),  8},
-    {DFMT(P010), 10},
-    {DFMT(P016), 16},
+static const struct d3d_decoded_format d3d11_formats[] = {
+    {DFMT(NV12),  8, IMGFMT_NV12},
+    {DFMT(P010), 10, IMGFMT_P010},
+    {DFMT(P016), 16, IMGFMT_P010},
 };
 #undef DFMT
 
-static BOOL d3d11_format_supported(struct lavc_ctx *s, const GUID *guid,
-                                   const struct d3d11_format *format)
+// Update hw_subfmt to the underlying format. Needed because AVFrame does not
+// have such an attribute, so it can't be passed through, and is updated here
+// instead. (But in the future, AVHWFramesContext could be used.)
+static struct mp_image *d3d11va_update_image_attribs(struct lavc_ctx *s,
+                                                     struct mp_image *img)
+{
+    ID3D11Texture2D *texture = (void *)img->planes[1];
+
+    if (!texture)
+        return img;
+
+    D3D11_TEXTURE2D_DESC texture_desc;
+    ID3D11Texture2D_GetDesc(texture, &texture_desc);
+    for (int n = 0; n < MP_ARRAY_SIZE(d3d11_formats); n++) {
+        if (d3d11_formats[n].dxfmt == texture_desc.Format) {
+            img->params.hw_subfmt = d3d11_formats[n].mpfmt;
+            break;
+        }
+    }
+
+    if (img->params.hw_subfmt == IMGFMT_NV12)
+        mp_image_setfmt(img, IMGFMT_D3D11NV12);
+
+    return img;
+}
+
+static bool d3d11_format_supported(struct lavc_ctx *s, const GUID *guid,
+                                   const struct d3d_decoded_format *format)
 {
     struct priv *p = s->hwdec_priv;
     BOOL is_supported = FALSE;
     HRESULT hr = ID3D11VideoDevice_CheckVideoDecoderFormat(
-        p->video_dev, guid, format->format, &is_supported);
+        p->video_dev, guid, format->dxfmt, &is_supported);
     if (FAILED(hr)) {
         MP_ERR(p, "Check decoder output format %s for decoder %s: %s\n",
                format->name, d3d_decoder_guid_to_desc(guid),
@@ -151,25 +219,13 @@ static void dump_decoder_info(struct lavc_ctx *s, const GUID *guid)
     struct priv *p = s->hwdec_priv;
     char fmts[256] = {0};
     for (int i = 0; i < MP_ARRAY_SIZE(d3d11_formats); i++) {
-        const struct d3d11_format *format = &d3d11_formats[i];
+        const struct d3d_decoded_format *format = &d3d11_formats[i];
         if (d3d11_format_supported(s, guid, format))
             mp_snprintf_cat(fmts, sizeof(fmts), " %s", format->name);
     }
     MP_VERBOSE(p, "%s %s\n", d3d_decoder_guid_to_desc(guid), fmts);
 }
 
-static DWORD get_dxfmt_cb(struct lavc_ctx *s, const GUID *guid, int depth)
-{
-    for (int i = 0; i < MP_ARRAY_SIZE(d3d11_formats); i++) {
-        const struct d3d11_format *format = &d3d11_formats[i];
-        if (depth <= format->depth &&
-            d3d11_format_supported(s, guid, format)) {
-            return format->format;
-        }
-    }
-    return 0;
-}
-
 static void d3d11va_destroy_decoder(void *arg)
 {
     struct d3d11va_decoder *decoder = arg;
@@ -188,6 +244,7 @@ static int d3d11va_init_decoder(struct lavc_ctx *s, int w, int h)
     struct priv *p = s->hwdec_priv;
     TA_FREEP(&p->decoder);
 
+    ID3D11Texture2D *texture = NULL;
     void *tmp = talloc_new(NULL);
 
     UINT n_guids = ID3D11VideoDevice_GetVideoDecoderProfileCount(p->video_dev);
@@ -204,31 +261,32 @@ static int d3d11va_init_decoder(struct lavc_ctx *s, int w, int h)
     }
 
     struct d3d_decoder_fmt fmt =
-        d3d_select_decoder_mode(s, device_guids, n_guids, get_dxfmt_cb);
-    if (fmt.mpfmt_decoded == IMGFMT_NONE) {
+        d3d_select_decoder_mode(s, device_guids, n_guids,
+                                d3d11_formats, MP_ARRAY_SIZE(d3d11_formats),
+                                d3d11_format_supported);
+    if (!fmt.format) {
         MP_ERR(p, "Failed to find a suitable decoder\n");
         goto done;
     }
 
     struct d3d11va_decoder *decoder = talloc_zero(tmp, struct d3d11va_decoder);
     talloc_set_destructor(decoder, d3d11va_destroy_decoder);
-    decoder->mpfmt_decoded = fmt.mpfmt_decoded;
+    decoder->mpfmt_decoded = fmt.format->mpfmt;
 
     int n_surfaces = hwdec_get_max_refs(s) + ADDITIONAL_SURFACES;
     int w_align = w, h_align = h;
     d3d_surface_align(s, &w_align, &h_align);
 
-    ID3D11Texture2D *texture = NULL;
     D3D11_TEXTURE2D_DESC tex_desc = {
         .Width            = w_align,
         .Height           = h_align,
         .MipLevels        = 1,
-        .Format           = fmt.dxfmt_decoded,
+        .Format           = fmt.format->dxfmt,
         .SampleDesc.Count = 1,
         .MiscFlags        = 0,
         .ArraySize        = n_surfaces,
         .Usage            = D3D11_USAGE_DEFAULT,
-        .BindFlags        = D3D11_BIND_DECODER,
+        .BindFlags        = D3D11_BIND_DECODER | D3D11_BIND_SHADER_RESOURCE,
         .CPUAccessFlags   = 0,
     };
     hr = ID3D11Device_CreateTexture2D(p->device, &tex_desc, NULL, &texture);
@@ -290,7 +348,7 @@ static int d3d11va_init_decoder(struct lavc_ctx *s, int w, int h)
         .Guid         = *fmt.guid,
         .SampleWidth  = w,
         .SampleHeight = h,
-        .OutputFormat = fmt.dxfmt_decoded,
+        .OutputFormat = fmt.format->dxfmt,
     };
     UINT n_cfg;
     hr = ID3D11VideoDevice_GetVideoDecoderConfigCount(p->video_dev,
@@ -365,9 +423,6 @@ static void destroy_device(struct lavc_ctx *s)
 
     if (p->device_ctx)
         ID3D11DeviceContext_Release(p->device_ctx);
-
-    if (p->d3d11_dll)
-        FreeLibrary(p->d3d11_dll);
 }
 
 static bool create_device(struct lavc_ctx *s, BOOL thread_safe)
@@ -375,14 +430,14 @@ static bool create_device(struct lavc_ctx *s, BOOL thread_safe)
     HRESULT hr;
     struct priv *p = s->hwdec_priv;
 
-    p->d3d11_dll = LoadLibrary(L"d3d11.dll");
-    if (!p->d3d11_dll) {
+    d3d_load_dlls();
+    if (!d3d11_dll) {
         MP_ERR(p, "Failed to load D3D11 library\n");
         return false;
     }
 
     PFN_D3D11_CREATE_DEVICE CreateDevice =
-        (void *)GetProcAddress(p->d3d11_dll, "D3D11CreateDevice");
+        (void *)GetProcAddress(d3d11_dll, "D3D11CreateDevice");
     if (!CreateDevice) {
         MP_ERR(p, "Failed to get D3D11CreateDevice symbol from DLL: %s\n",
                mp_LastError_to_str());
@@ -445,8 +500,20 @@ static int d3d11va_init(struct lavc_ctx *s)
         p->sw_pool = talloc_steal(p, mp_image_pool_new(17));
     }
 
-    if (!create_device(s, FALSE))
+    p->device = hwdec_devices_load(s->hwdec_devs, s->hwdec->type);
+    if (p->device) {
+        ID3D11Device_AddRef(p->device);
+        ID3D11Device_GetImmediateContext(p->device, &p->device_ctx);
+        if (!p->device_ctx)
+            goto fail;
+        MP_VERBOSE(p, "Using VO-supplied device %p.\n", p->device);
+    } else if (s->hwdec->type == HWDEC_D3D11VA) {
+        MP_ERR(p, "No Direct3D device provided for native d3d11 decoding\n");
         goto fail;
+    } else {
+        if (!create_device(s, FALSE))
+            goto fail;
+    }
 
     hr = ID3D11DeviceContext_QueryInterface(p->device_ctx,
                                             &IID_ID3D11VideoContext,
@@ -478,16 +545,31 @@ fail:
     return -1;
 }
 
-static int d3d11va_probe(struct vd_lavc_hwdec *hwdec,
-                         struct mp_hwdec_info *info,
+static int d3d11va_probe(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
                          const char *codec)
 {
-    hwdec_request_api(info, "d3d11va");
+    // d3d11va-copy can do without external context; d3d11va requires it.
+    if (hwdec->type != HWDEC_D3D11VA_COPY) {
+        if (!hwdec_devices_load(ctx->hwdec_devs, HWDEC_D3D11VA))
+            return HWDEC_ERR_NO_CTX;
+    }
     return d3d_probe_codec(codec);
 }
 
+const struct vd_lavc_hwdec mp_vd_lavc_d3d11va = {
+    .type           = HWDEC_D3D11VA,
+    .image_format   = IMGFMT_D3D11VA,
+    .probe          = d3d11va_probe,
+    .init           = d3d11va_init,
+    .uninit         = d3d11va_uninit,
+    .init_decoder   = d3d11va_init_decoder,
+    .allocate_image = d3d11va_allocate_image,
+    .process_image  = d3d11va_update_image_attribs,
+};
+
 const struct vd_lavc_hwdec mp_vd_lavc_d3d11va_copy = {
     .type           = HWDEC_D3D11VA_COPY,
+    .copying        = true,
     .image_format   = IMGFMT_D3D11VA,
     .probe          = d3d11va_probe,
     .init           = d3d11va_init,
diff --git a/video/decode/dec_video.h b/video/decode/dec_video.h
index f4646a9..1030973 100644
--- a/video/decode/dec_video.h
+++ b/video/decode/dec_video.h
@@ -32,7 +32,7 @@ struct dec_video {
     struct mpv_global *global;
     struct MPOpts *opts;
     const struct vd_functions *vd_driver;
-    struct mp_hwdec_info *hwdec_info; // video output hwdec handles
+    struct mp_hwdec_devices *hwdec_devs; // video output hwdec handles
     struct sh_stream *header;
     struct mp_codec_params *codec;
 
diff --git a/video/decode/dxva2.c b/video/decode/dxva2.c
index c90fa76..fc52aca 100644
--- a/video/decode/dxva2.c
+++ b/video/decode/dxva2.c
@@ -19,6 +19,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#include <initguid.h>
+
 #define DXVA2API_USE_BITFIELDS
 #include <libavcodec/dxva2.h>
 
@@ -30,8 +32,6 @@
 #include "video/mp_image_pool.h"
 #include "video/hwdec.h"
 
-#include "video/d3d.h"
-#include "video/dxva2.h"
 #include "d3d.h"
 
 #define ADDITIONAL_SURFACES (4 + HWDEC_DELAY_QUEUE_COUNT)
@@ -39,8 +39,6 @@
 struct priv {
     struct mp_log *log;
 
-    HMODULE                      d3d9_dll;
-    HMODULE                      dxva2_dll;
     IDirect3D9                  *d3d9;
     IDirect3DDevice9            *device;
     HANDLE                       device_handle;
@@ -52,6 +50,47 @@ struct priv {
     int                          mpfmt_decoded;
 };
 
+struct dxva2_surface {
+    IDirectXVideoDecoder *decoder;
+    IDirect3DSurface9    *surface;
+};
+
+static void dxva2_release_img(void *arg)
+{
+    struct dxva2_surface *surface = arg;
+    if (surface->surface)
+        IDirect3DSurface9_Release(surface->surface);
+
+    if (surface->decoder)
+        IDirectXVideoDecoder_Release(surface->decoder);
+
+    talloc_free(surface);
+}
+
+static struct mp_image *dxva2_new_ref(IDirectXVideoDecoder *decoder,
+                                      IDirect3DSurface9 *d3d9_surface,
+                                      int w, int h)
+{
+    if (!decoder || !d3d9_surface)
+        return NULL;
+    struct dxva2_surface *surface = talloc_zero(NULL, struct dxva2_surface);
+
+    surface->surface = d3d9_surface;
+    IDirect3DSurface9_AddRef(surface->surface);
+    surface->decoder = decoder;
+    IDirectXVideoDecoder_AddRef(surface->decoder);
+
+    struct mp_image *mpi =
+        mp_image_new_custom_ref(NULL, surface, dxva2_release_img);
+    if (!mpi)
+        abort();
+
+    mp_image_setfmt(mpi, IMGFMT_DXVA2);
+    mp_image_set_size(mpi, w, h);
+    mpi->planes[3] = (void *)surface->surface;
+    return mpi;
+}
+
 static struct mp_image *dxva2_allocate_image(struct lavc_ctx *s, int w, int h)
 {
     struct priv *p = s->hwdec_priv;
@@ -67,7 +106,8 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
 {
     HRESULT hr;
     struct priv *p = s->hwdec_priv;
-    IDirect3DSurface9 *surface = d3d9_surface_in_mp_image(img);
+    IDirect3DSurface9 *surface = img->imgfmt == IMGFMT_DXVA2 ?
+        (IDirect3DSurface9 *)img->planes[3] : NULL;
 
     if (!surface) {
         MP_ERR(p, "Failed to get Direct3D surface from mp_image\n");
@@ -108,15 +148,10 @@ static struct mp_image *dxva2_retrieve_image(struct lavc_ctx *s,
     return sw_img;
 }
 
-struct d3d9_format {
-    D3DFORMAT format;
-    int       depth;
-};
-
-static const struct d3d9_format d3d9_formats[] = {
-    {MKTAG('N','V','1','2'),  8},
-    {MKTAG('P','0','1','0'), 10},
-    {MKTAG('P','0','1','6'), 16},
+static const struct d3d_decoded_format d3d9_formats[] = {
+    {MKTAG('N','V','1','2'), "NV12", 8,  IMGFMT_NV12},
+    {MKTAG('P','0','1','0'), "P010", 10, IMGFMT_P010},
+    {MKTAG('P','0','1','6'), "P016", 16, IMGFMT_P010},
 };
 
 static void dump_decoder_info(struct lavc_ctx *s,
@@ -133,7 +168,7 @@ static void dump_decoder_info(struct lavc_ctx *s,
         HRESULT hr = IDirectXVideoDecoderService_GetDecoderRenderTargets(
             p->decoder_service, guid, &n_formats, &formats);
         if (FAILED(hr)) {
-            MP_ERR(p, "Failed to get render targets for decoder %s:%s",
+            MP_ERR(p, "Failed to get render targets for decoder %s:%s\n",
                    description, mp_HRESULT_to_str(hr));
         }
 
@@ -148,9 +183,10 @@ static void dump_decoder_info(struct lavc_ctx *s,
     }
 }
 
-static DWORD get_dxfmt_cb(struct lavc_ctx *s, const GUID *guid, int depth)
+static bool dxva2_format_supported(struct lavc_ctx *s, const GUID *guid,
+                                   const struct d3d_decoded_format *format)
 {
-    DWORD ret = 0;
+    bool ret = false;
     struct priv *p = s->hwdec_priv;
     D3DFORMAT *formats = NULL;
     UINT     n_formats = 0;
@@ -162,19 +198,12 @@ static DWORD get_dxfmt_cb(struct lavc_ctx *s, const GUID *guid, int depth)
         return 0;
     }
 
-    for (int i = 0; i < MP_ARRAY_SIZE(d3d9_formats); i++) {
-        const struct d3d9_format *d3d9_fmt = &d3d9_formats[i];
-        if (d3d9_fmt->depth < depth)
-            continue;
-
-        for (UINT j = 0; j < n_formats; j++) {
-            if (formats[i] == d3d9_fmt->format) {
-                ret = formats[i];
-                goto done;
-            }
-        }
+    for (int i = 0; i < n_formats; i++) {
+        ret = formats[i] == format->dxfmt;
+        if (ret)
+            break;
     }
-done:
+
     CoTaskMemFree(formats);
     return ret;
 }
@@ -204,14 +233,16 @@ static int dxva2_init_decoder(struct lavc_ctx *s, int w, int h)
     dump_decoder_info(s, device_guids, n_guids);
 
     struct d3d_decoder_fmt fmt =
-        d3d_select_decoder_mode(s, device_guids, n_guids, get_dxfmt_cb);
+        d3d_select_decoder_mode(s, device_guids, n_guids,
+                                d3d9_formats, MP_ARRAY_SIZE(d3d9_formats),
+                                dxva2_format_supported);
     CoTaskMemFree(device_guids);
-    if (fmt.mpfmt_decoded == IMGFMT_NONE) {
+    if (!fmt.format) {
         MP_ERR(p, "Failed to find a suitable decoder\n");
         goto done;
     }
 
-    p->mpfmt_decoded = fmt.mpfmt_decoded;
+    p->mpfmt_decoded = fmt.format->mpfmt;
     struct mp_image_pool *decoder_pool =
         talloc_steal(tmp, mp_image_pool_new(n_surfaces));
     DXVA2_ConfigPictureDecode *decoder_config =
@@ -222,7 +253,7 @@ static int dxva2_init_decoder(struct lavc_ctx *s, int w, int h)
     DXVA2_VideoDesc video_desc ={
         .SampleWidth  = w,
         .SampleHeight = h,
-        .Format       = fmt.dxfmt_decoded,
+        .Format       = fmt.format->dxfmt,
     };
     UINT                     n_configs  = 0;
     DXVA2_ConfigPictureDecode *configs = NULL;
@@ -255,7 +286,7 @@ static int dxva2_init_decoder(struct lavc_ctx *s, int w, int h)
     hr = IDirectXVideoDecoderService_CreateSurface(
         p->decoder_service,
         w_align, h_align,
-        n_surfaces - 1, fmt.dxfmt_decoded, D3DPOOL_DEFAULT, 0,
+        n_surfaces - 1, fmt.format->dxfmt, D3DPOOL_DEFAULT, 0,
         DXVA2_VideoDecoderRenderTarget, surfaces, NULL);
     if (FAILED(hr)) {
         MP_ERR(p, "Failed to create %d video surfaces: %s\n",
@@ -316,25 +347,20 @@ static void destroy_device(struct lavc_ctx *s)
 
     if (p->d3d9)
         IDirect3D9_Release(p->d3d9);
-
-    if (p->d3d9_dll)
-        FreeLibrary(p->d3d9_dll);
-
-    if (p->dxva2_dll)
-        FreeLibrary(p->dxva2_dll);
 }
 
 static bool create_device(struct lavc_ctx *s)
 {
     struct priv *p = s->hwdec_priv;
-    p->d3d9_dll = LoadLibrary(L"d3d9.dll");
-    if (!p->d3d9_dll) {
+
+    d3d_load_dlls();
+    if (!d3d9_dll) {
         MP_ERR(p, "Failed to load D3D9 library\n");
         return false;
     }
 
     IDirect3D9* (WINAPI *Direct3DCreate9)(UINT) =
-        (void *)GetProcAddress(p->d3d9_dll, "Direct3DCreate9");
+        (void *)GetProcAddress(d3d9_dll, "Direct3DCreate9");
     if (!Direct3DCreate9) {
         MP_ERR(p, "Failed to locate Direct3DCreate9\n");
         return false;
@@ -413,9 +439,7 @@ static int dxva2_init(struct lavc_ctx *s)
         p->sw_pool = talloc_steal(p, mp_image_pool_new(17));
     }
 
-    if (s->hwdec_info && s->hwdec_info->hwctx && s->hwdec_info->hwctx->d3d_ctx)
-        p->device = s->hwdec_info->hwctx->d3d_ctx->d3d9_device;
-
+    p->device = hwdec_devices_load(s->hwdec_devs, s->hwdec->type);
     if (p->device) {
         IDirect3D9_AddRef(p->device);
         MP_VERBOSE(p, "Using VO-supplied device %p.\n", p->device);
@@ -427,15 +451,14 @@ static int dxva2_init(struct lavc_ctx *s)
             goto fail;
     }
 
-    p->dxva2_dll = LoadLibrary(L"dxva2.dll");
-    if (!p->dxva2_dll) {
+    d3d_load_dlls();
+    if (!dxva2_dll) {
         MP_ERR(p, "Failed to load DXVA2 library\n");
         goto fail;
     }
 
     HRESULT (WINAPI *CreateDeviceManager9)(UINT *, IDirect3DDeviceManager9 **) =
-        (void *)GetProcAddress(p->dxva2_dll,
-                               "DXVA2CreateDirect3DDeviceManager9");
+        (void *)GetProcAddress(dxva2_dll, "DXVA2CreateDirect3DDeviceManager9");
     if (!CreateDeviceManager9) {
         MP_ERR(p, "Failed to locate DXVA2CreateDirect3DDeviceManager9\n");
         goto fail;
@@ -484,15 +507,15 @@ fail:
     return -1;
 }
 
-static int dxva2_probe(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
+static int dxva2_probe(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
                        const char *codec)
 {
-    hwdec_request_api(info, "dxva2");
     // dxva2-copy can do without external context; dxva2 requires it.
-    if (hwdec->type != HWDEC_DXVA2_COPY) {
-        if (!info || !info->hwctx || !info->hwctx->d3d_ctx ||
-            info->hwctx->type == HWDEC_DXVA2_COPY)
+    if (hwdec->type == HWDEC_DXVA2) {
+        if (!hwdec_devices_load(ctx->hwdec_devs, HWDEC_DXVA2))
             return HWDEC_ERR_NO_CTX;
+    } else {
+        hwdec_devices_load(ctx->hwdec_devs, HWDEC_DXVA2_COPY);
     }
     return d3d_probe_codec(codec);
 }
@@ -509,6 +532,7 @@ const struct vd_lavc_hwdec mp_vd_lavc_dxva2 = {
 
 const struct vd_lavc_hwdec mp_vd_lavc_dxva2_copy = {
     .type           = HWDEC_DXVA2_COPY,
+    .copying        = true,
     .image_format   = IMGFMT_DXVA2,
     .probe          = dxva2_probe,
     .init           = dxva2_init,
diff --git a/video/decode/lavc.h b/video/decode/lavc.h
index 826edbf..689222d 100644
--- a/video/decode/lavc.h
+++ b/video/decode/lavc.h
@@ -30,7 +30,7 @@ typedef struct lavc_ctx {
     int max_delay_queue;
 
     // From VO
-    struct mp_hwdec_info *hwdec_info;
+    struct mp_hwdec_devices *hwdec_devs;
 
     // For free use by hwdec implementation
     void *hwdec_priv;
@@ -49,12 +49,14 @@ struct vd_lavc_hwdec {
     // If not-0: the IMGFMT_ format that should be accepted in the libavcodec
     // get_format callback.
     int image_format;
+    // Always returns a non-hwaccel image format.
+    bool copying;
     // Setting this will queue the given number of frames before calling
     // process_image() or returning them to the renderer. This can increase
     // efficiency by not blocking on the hardware pipeline by reading back
     // immediately after decoding.
     int delay_queue;
-    int (*probe)(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
+    int (*probe)(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
                  const char *codec);
     int (*init)(struct lavc_ctx *ctx);
     int (*init_decoder)(struct lavc_ctx *ctx, int w, int h);
@@ -69,6 +71,10 @@ struct vd_lavc_hwdec {
     void (*unlock)(struct lavc_ctx *ctx);
     // Optional; if a special hardware decoder is needed (instead of "hwaccel").
     const char *(*get_codec)(struct lavc_ctx *ctx, const char *codec);
+    // Suffix for libavcodec decoder. If non-NULL, get_codec() is overridden
+    // with hwdec_find_decoder.
+    // Intuitively, this will force the corresponding wrapper decoder.
+    const char *lavc_suffix;
 };
 
 enum {
@@ -89,4 +95,6 @@ bool hwdec_check_codec_support(const char *codec,
                                const struct hwdec_profile_entry *table);
 int hwdec_get_max_refs(struct lavc_ctx *ctx);
 
+const char *hwdec_find_decoder(const char *codec, const char *suffix);
+
 #endif
diff --git a/video/decode/mediacodec.c b/video/decode/mediacodec.c
deleted file mode 100644
index 37ce1b8..0000000
--- a/video/decode/mediacodec.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "lavc.h"
-#include "common/common.h"
-
-static const char *const codecs[][2] = {
-    {"h264",        "h264_mediacodec"},
-    {0}
-};
-
-static const char *map_codec(const char *c)
-{
-    for (int n = 0; codecs[n][0]; n++) {
-        if (c && strcmp(codecs[n][0], c) == 0)
-            return codecs[n][1];
-    }
-    return NULL;
-}
-
-static int init_decoder(struct lavc_ctx *ctx, int w, int h)
-{
-    return 0;
-}
-
-static void uninit(struct lavc_ctx *ctx)
-{
-}
-
-static int init(struct lavc_ctx *ctx)
-{
-    return 0;
-}
-
-static int probe(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
-                 const char *decoder)
-{
-    return map_codec(decoder) ? 0 : HWDEC_ERR_NO_CODEC;
-}
-
-static const char *get_codec(struct lavc_ctx *ctx, const char *codec)
-{
-    return map_codec(codec);
-}
-
-const struct vd_lavc_hwdec mp_vd_lavc_mediacodec = {
-    .type = HWDEC_MEDIACODEC,
-    .image_format = IMGFMT_NV12,
-    .probe = probe,
-    .init = init,
-    .uninit = uninit,
-    .init_decoder = init_decoder,
-    .get_codec = get_codec,
-};
diff --git a/video/decode/rpi.c b/video/decode/rpi.c
deleted file mode 100644
index f2ed6d2..0000000
--- a/video/decode/rpi.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "lavc.h"
-#include "common/common.h"
-
-static const char *const codecs[][2] = {
-    {"h264",        "h264_mmal"},
-    {"mpeg2video",  "mpeg2_mmal"},
-    {"mpeg4",       "mpeg4_mmal"},
-    {"vc1",         "vc1_mmal"},
-    {0}
-};
-
-static const char *map_codec(const char *c)
-{
-    for (int n = 0; codecs[n][0]; n++) {
-        if (c && strcmp(codecs[n][0], c) == 0)
-            return codecs[n][1];
-    }
-    return NULL;
-}
-
-static int init_decoder(struct lavc_ctx *ctx, int w, int h)
-{
-    return 0;
-}
-
-static void uninit(struct lavc_ctx *ctx)
-{
-}
-
-static int init(struct lavc_ctx *ctx)
-{
-    return 0;
-}
-
-static int probe(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
-                 const char *codec)
-{
-    return map_codec(codec) ? 0 : HWDEC_ERR_NO_CODEC;
-}
-
-static const char *get_codec(struct lavc_ctx *ctx, const char *codec)
-{
-    return map_codec(codec);
-}
-
-const struct vd_lavc_hwdec mp_vd_lavc_rpi = {
-    .type = HWDEC_RPI,
-    .image_format = IMGFMT_MMAL,
-    .probe = probe,
-    .init = init,
-    .uninit = uninit,
-    .init_decoder = init_decoder,
-    .get_codec = get_codec,
-};
diff --git a/video/decode/vaapi.c b/video/decode/vaapi.c
index 2682225..aa8291d 100644
--- a/video/decode/vaapi.c
+++ b/video/decode/vaapi.c
@@ -72,15 +72,6 @@ struct va_native_display {
     void (*destroy)(struct priv *p);
 };
 
-static const struct va_native_display disp_x11;
-
-static const struct va_native_display *const native_displays[] = {
-#if HAVE_VAAPI_X11
-    &disp_x11,
-#endif
-    NULL
-};
-
 #if HAVE_VAAPI_X11
 #include <X11/Xlib.h>
 #include <va/va_x11.h>
@@ -108,6 +99,13 @@ static const struct va_native_display disp_x11 = {
 };
 #endif
 
+static const struct va_native_display *const native_displays[] = {
+#if HAVE_VAAPI_X11
+    &disp_x11,
+#endif
+    NULL
+};
+
 #define HAS_HEVC VA_CHECK_VERSION(0, 38, 0)
 #define HAS_VP9 (VA_CHECK_VERSION(0, 38, 1) && defined(FF_PROFILE_VP9_0))
 
@@ -340,6 +338,12 @@ static struct mp_image *allocate_image(struct lavc_ctx *ctx, int w, int h)
     return img;
 }
 
+static struct mp_image *update_format(struct lavc_ctx *ctx, struct mp_image *img)
+{
+    va_surface_init_subformat(img);
+    return img;
+}
+
 static void destroy_va_dummy_ctx(struct priv *p)
 {
     va_destroy(p->ctx);
@@ -351,7 +355,7 @@ static void destroy_va_dummy_ctx(struct priv *p)
 
 // Creates a "private" VADisplay, disconnected from the VO. We just create a
 // new X connection, because that's simpler. (We could also pass the X
-// connection along with struct mp_hwdec_info, if we wanted.)
+// connection along with struct mp_hwdec_devices, if we wanted.)
 static bool create_va_dummy_ctx(struct priv *p)
 {
     for (int n = 0; native_displays[n]; n++) {
@@ -393,21 +397,23 @@ static void uninit(struct lavc_ctx *ctx)
     ctx->hwdec_priv = NULL;
 }
 
-static int init_with_vactx(struct lavc_ctx *ctx, struct mp_vaapi_ctx *vactx)
+static int init(struct lavc_ctx *ctx, bool direct)
 {
     struct priv *p = talloc_ptrtype(NULL, p);
     *p = (struct priv) {
         .log = mp_log_new(p, ctx->log, "vaapi"),
-        .ctx = vactx,
         .va_context = &p->va_context_storage,
         .rt_format = VA_RT_FORMAT_YUV420
     };
 
-    if (!p->ctx)
+    if (direct) {
+        p->ctx = hwdec_devices_get(ctx->hwdec_devs, HWDEC_VAAPI)->ctx;
+    } else {
         create_va_dummy_ctx(p);
-    if (!p->ctx) {
-        talloc_free(p);
-        return -1;
+        if (!p->ctx) {
+            talloc_free(p);
+            return -1;
+        }
     }
 
     p->display = p->ctx->display;
@@ -425,25 +431,22 @@ static int init_with_vactx(struct lavc_ctx *ctx, struct mp_vaapi_ctx *vactx)
     return 0;
 }
 
-static int init(struct lavc_ctx *ctx)
+static int init_direct(struct lavc_ctx *ctx)
 {
-    return init_with_vactx(ctx, ctx->hwdec_info->hwctx->vaapi_ctx);
+    return init(ctx, true);
 }
 
-static int probe(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
+static int probe(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
                  const char *codec)
 {
-    hwdec_request_api(info, "vaapi");
-    if (!info || !info->hwctx || !info->hwctx->vaapi_ctx)
+    if (!hwdec_devices_load(ctx->hwdec_devs, HWDEC_VAAPI))
         return HWDEC_ERR_NO_CTX;
     if (!hwdec_check_codec_support(codec, profiles))
         return HWDEC_ERR_NO_CODEC;
-    if (va_guess_if_emulated(info->hwctx->vaapi_ctx))
-        return HWDEC_ERR_EMULATED;
     return 0;
 }
 
-static int probe_copy(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
+static int probe_copy(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
                       const char *codec)
 {
     struct priv dummy = {mp_null_log};
@@ -460,7 +463,7 @@ static int probe_copy(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
 
 static int init_copy(struct lavc_ctx *ctx)
 {
-    return init_with_vactx(ctx, NULL);
+    return init(ctx, false);
 }
 
 static struct mp_image *copy_image(struct lavc_ctx *ctx, struct mp_image *img)
@@ -491,16 +494,18 @@ const struct vd_lavc_hwdec mp_vd_lavc_vaapi = {
     .type = HWDEC_VAAPI,
     .image_format = IMGFMT_VAAPI,
     .probe = probe,
-    .init = init,
+    .init = init_direct,
     .uninit = uninit,
     .init_decoder = init_decoder,
     .allocate_image = allocate_image,
     .lock = intel_shit_lock,
     .unlock = intel_crap_unlock,
+    .process_image = update_format,
 };
 
 const struct vd_lavc_hwdec mp_vd_lavc_vaapi_copy = {
     .type = HWDEC_VAAPI_COPY,
+    .copying = true,
     .image_format = IMGFMT_VAAPI,
     .probe = probe_copy,
     .init = init_copy,
diff --git a/video/decode/vd_lavc.c b/video/decode/vd_lavc.c
index a444f88..fbb04d1 100644
--- a/video/decode/vd_lavc.c
+++ b/video/decode/vd_lavc.c
@@ -126,9 +126,20 @@ extern const struct vd_lavc_hwdec mp_vd_lavc_vaapi;
 extern const struct vd_lavc_hwdec mp_vd_lavc_vaapi_copy;
 extern const struct vd_lavc_hwdec mp_vd_lavc_dxva2;
 extern const struct vd_lavc_hwdec mp_vd_lavc_dxva2_copy;
+extern const struct vd_lavc_hwdec mp_vd_lavc_d3d11va;
 extern const struct vd_lavc_hwdec mp_vd_lavc_d3d11va_copy;
-extern const struct vd_lavc_hwdec mp_vd_lavc_rpi;
-extern const struct vd_lavc_hwdec mp_vd_lavc_mediacodec;
+
+static const struct vd_lavc_hwdec mp_vd_lavc_rpi = {
+    .type = HWDEC_RPI,
+    .lavc_suffix = "_mmal",
+    .image_format = IMGFMT_MMAL,
+};
+
+static const struct vd_lavc_hwdec mp_vd_lavc_mediacodec = {
+    .type = HWDEC_MEDIACODEC,
+    .lavc_suffix = "_mediacodec",
+    .copying = true,
+};
 
 static const struct vd_lavc_hwdec *const hwdec_list[] = {
 #if HAVE_RPI
@@ -144,11 +155,10 @@ static const struct vd_lavc_hwdec *const hwdec_list[] = {
     &mp_vd_lavc_vaapi,
     &mp_vd_lavc_vaapi_copy,
 #endif
-#if HAVE_DXVA2_HWACCEL
+#if HAVE_D3D_HWACCEL
+    &mp_vd_lavc_d3d11va,
     &mp_vd_lavc_dxva2,
     &mp_vd_lavc_dxva2_copy,
-#endif
-#if HAVE_D3D11VA_HWACCEL
     &mp_vd_lavc_d3d11va_copy,
 #endif
 #if HAVE_ANDROID
@@ -233,18 +243,51 @@ int hwdec_get_max_refs(struct lavc_ctx *ctx)
     return 2;
 }
 
-void hwdec_request_api(struct mp_hwdec_info *info, const char *api_name)
+// This is intended to return the name of a decoder for a given wrapper API.
+// Decoder wrappers are usually added to libavcodec with a specific suffix.
+// For example the mmal h264 decoder is named h264_mmal.
+// This API would e.g. return h264_mmal for
+// hwdec_find_decoder("h264", "_mmal").
+// Just concatenating the two names will not always work due to inconsistencies
+// (e.g. "mpeg2video" vs. "mpeg2").
+const char *hwdec_find_decoder(const char *codec, const char *suffix)
+{
+    enum AVCodecID wanted = mp_codec_to_av_codec_id(codec);
+    if (wanted == AV_CODEC_ID_NONE)
+        return NULL;
+    // Walk every codec registered with libavcodec: passing NULL yields the
+    // first entry, and a NULL result marks the end of the list.
+    for (AVCodec *c = av_codec_next(NULL); c; c = av_codec_next(c)) {
+        if (c->id != wanted || !av_codec_is_decoder(c))
+            continue;
+        // Only a decoder whose name ends with the given suffix qualifies.
+        if (bstr_endswith0(bstr0(c->name), suffix))
+            return c->name;
+    }
+    return NULL;
+}
+
+// Parallel to hwdec_find_decoder(): return whether a hwdec can use the given
+// decoder. This can't be answered accurately; it works for wrapper decoders
+// only (like mmal), and for real hwaccels this will always return false.
+static bool hwdec_is_wrapper(struct vd_lavc_hwdec *hwdec, const char *decoder)
 {
-    if (info && info->load_api)
-        info->load_api(info, api_name);
+    if (!hwdec->lavc_suffix)
+        return false;
+    return bstr_endswith0(bstr0(decoder), hwdec->lavc_suffix);
 }
 
-static int hwdec_probe(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
+static int hwdec_probe(struct dec_video *vd, struct vd_lavc_hwdec *hwdec,
                        const char *codec)
 {
+    vd_ffmpeg_ctx *ctx = vd->priv;
     int r = 0;
     if (hwdec->probe)
-        r = hwdec->probe(hwdec, info, codec);
+        r = hwdec->probe(ctx, hwdec, codec);
+    if (r >= 0) {
+        if (hwdec->lavc_suffix && !hwdec_find_decoder(codec, hwdec->lavc_suffix))
+            return HWDEC_ERR_NO_CODEC;
+    }
     return r;
 }
 
@@ -258,7 +301,7 @@ static struct vd_lavc_hwdec *probe_hwdec(struct dec_video *vd, bool autoprobe,
         MP_VERBOSE(vd, "Requested hardware decoder not compiled.\n");
         return NULL;
     }
-    int r = hwdec_probe(hwdec, vd->hwdec_info, codec);
+    int r = hwdec_probe(vd, hwdec, codec);
     if (r == HWDEC_ERR_EMULATED) {
         if (autoprobe)
             return NULL;
@@ -284,17 +327,14 @@ static void uninit(struct dec_video *vd)
     talloc_free(vd->priv);
 }
 
-static bool force_fallback(struct dec_video *vd)
+static void force_fallback(struct dec_video *vd)
 {
     vd_ffmpeg_ctx *ctx = vd->priv;
-    if (!ctx->hwdec)
-        return false;
 
     uninit_avctx(vd);
     int lev = ctx->hwdec_notified ? MSGL_WARN : MSGL_V;
     mp_msg(vd->log, lev, "Falling back to software decoding.\n");
     init_avctx(vd, ctx->decoder, NULL);
-    return true;
 }
 
 static void reinit(struct dec_video *vd)
@@ -308,14 +348,38 @@ static void reinit(struct dec_video *vd)
     struct vd_lavc_hwdec *hwdec = NULL;
 
     if (hwdec_codec_allowed(vd, codec)) {
-        if (vd->opts->hwdec_api == HWDEC_AUTO) {
+        int api = vd->opts->hwdec_api;
+        if (HWDEC_IS_AUTO(api)) {
+            // If a specific decoder is forced, we should try a hwdec method
+            // that works with it, instead of simply failing later at runtime.
+            // This is good for avoiding trying "normal" hwaccels on wrapper
+            // decoders (like vaapi on a mmal decoder). Since libavcodec doesn't
+            // tell us which decoder supports which hwaccel methods without
+            // actually running it, do it by detecting such wrapper decoders.
+            // On the other hand, e.g. "--hwdec=rpi" should always force the
+            // wrapper decoder, so be careful not to break this case.
+            bool might_be_wrapper = false;
+            for (int n = 0; hwdec_list[n]; n++) {
+                struct vd_lavc_hwdec *other = (void *)hwdec_list[n];
+                if (hwdec_is_wrapper(other, decoder))
+                    might_be_wrapper = true;
+            }
             for (int n = 0; hwdec_list[n]; n++) {
                 hwdec = probe_hwdec(vd, true, hwdec_list[n]->type, codec);
-                if (hwdec)
+                if (hwdec) {
+                    if (might_be_wrapper && !hwdec_is_wrapper(hwdec, decoder)) {
+                        MP_VERBOSE(vd, "This hwaccel is not compatible.\n");
+                        continue;
+                    }
+                    if (api == HWDEC_AUTO_COPY && !hwdec->copying) {
+                        MP_VERBOSE(vd, "Not using this for auto-copy mode.\n");
+                        continue;
+                    }
                     break;
+                }
             }
-        } else if (vd->opts->hwdec_api != HWDEC_NONE) {
-            hwdec = probe_hwdec(vd, false, vd->opts->hwdec_api, codec);
+        } else if (api != HWDEC_NONE) {
+            hwdec = probe_hwdec(vd, false, api, codec);
         }
     } else {
         MP_VERBOSE(vd, "Not trying to use hardware decoding: codec %s is not "
@@ -326,13 +390,15 @@ static void reinit(struct dec_video *vd)
     if (hwdec) {
         if (hwdec->get_codec)
             decoder = hwdec->get_codec(ctx, decoder);
+        if (hwdec->lavc_suffix)
+            decoder = hwdec_find_decoder(codec, hwdec->lavc_suffix);
         MP_VERBOSE(vd, "Trying hardware decoding.\n");
     } else {
         MP_VERBOSE(vd, "Using software decoding.\n");
     }
 
     init_avctx(vd, decoder, hwdec);
-    if (!ctx->avctx)
+    if (!ctx->avctx && hwdec)
         force_fallback(vd);
 }
 
@@ -343,6 +409,7 @@ static int init(struct dec_video *vd, const char *decoder)
     ctx->log = vd->log;
     ctx->opts = vd->opts;
     ctx->decoder = talloc_strdup(ctx, decoder);
+    ctx->hwdec_devs = vd->hwdec_devs;
 
     reinit(vd);
 
@@ -372,8 +439,6 @@ static void init_avctx(struct dec_video *vd, const char *decoder,
     if (!lavc_codec)
         return;
 
-    ctx->hwdec_info = vd->hwdec_info;
-
     ctx->codec_timebase = (AVRational){0};
     if (strstr(decoder, "_mmal") || strstr(decoder, "_mediacodec"))
         ctx->codec_timebase = (AVRational){1, 1000000};
@@ -389,17 +454,21 @@ static void init_avctx(struct dec_video *vd, const char *decoder,
     avctx->codec_type = AVMEDIA_TYPE_VIDEO;
     avctx->codec_id = lavc_codec->id;
 
+    if (ctx->codec_timebase.num)
+        avctx->time_base = ctx->codec_timebase;
+
     avctx->refcounted_frames = 1;
     ctx->pic = av_frame_alloc();
     if (!ctx->pic)
         goto error;
 
     if (ctx->hwdec) {
-        avctx->thread_count    = 1;
-        avctx->get_format      = get_format_hwdec;
+        avctx->thread_count = 1;
+        if (ctx->hwdec->image_format)
+            avctx->get_format = get_format_hwdec;
         if (ctx->hwdec->allocate_image)
             avctx->get_buffer2 = get_buffer2_hwdec;
-        if (ctx->hwdec->init(ctx) < 0)
+        if (ctx->hwdec->init && ctx->hwdec->init(ctx) < 0)
             goto error;
         ctx->max_delay_queue = ctx->hwdec->delay_queue;
     } else {
@@ -409,14 +478,8 @@ static void init_avctx(struct dec_video *vd, const char *decoder,
     avctx->flags |= lavc_param->bitexact ? CODEC_FLAG_BITEXACT : 0;
     avctx->flags2 |= lavc_param->fast ? CODEC_FLAG2_FAST : 0;
 
-    if (lavc_param->show_all) {
-#ifdef CODEC_FLAG2_SHOW_ALL
-        avctx->flags2 |= CODEC_FLAG2_SHOW_ALL; // ffmpeg only?
-#endif
-#ifdef CODEC_FLAG_OUTPUT_CORRUPT
-        avctx->flags |= CODEC_FLAG_OUTPUT_CORRUPT; // added with Libav 10
-#endif
-    }
+    if (lavc_param->show_all)
+        avctx->flags |= CODEC_FLAG_OUTPUT_CORRUPT;
 
     avctx->skip_loop_filter = lavc_param->skip_loop_filter;
     avctx->skip_idct = lavc_param->skip_idct;
@@ -551,31 +614,29 @@ static enum AVPixelFormat get_format_hwdec(struct AVCodecContext *avctx,
     ctx->hwdec_request_reinit |= ctx->hwdec_failed;
     ctx->hwdec_failed = false;
 
-    if (ctx->hwdec->image_format) {
-        for (int i = 0; fmt[i] != AV_PIX_FMT_NONE; i++) {
-            if (ctx->hwdec->image_format == pixfmt2imgfmt(fmt[i])) {
-                // There could be more reasons for a change, and it's possible
-                // that we miss some. (Might also depend on the hwaccel type.)
-                bool change =
-                    ctx->hwdec_w != avctx->coded_width ||
-                    ctx->hwdec_h != avctx->coded_height ||
-                    ctx->hwdec_fmt != ctx->hwdec->image_format ||
-                    ctx->hwdec_profile != avctx->profile ||
-                    ctx->hwdec_request_reinit;
-                ctx->hwdec_w = avctx->coded_width;
-                ctx->hwdec_h = avctx->coded_height;
-                ctx->hwdec_fmt = ctx->hwdec->image_format;
-                ctx->hwdec_profile = avctx->profile;
-                ctx->hwdec_request_reinit = false;
-                if (change) {
-                    if (ctx->hwdec->init_decoder(ctx, ctx->hwdec_w, ctx->hwdec_h) < 0)
-                    {
-                        ctx->hwdec_fmt = 0;
-                        break;
-                    }
+    for (int i = 0; fmt[i] != AV_PIX_FMT_NONE; i++) {
+        if (ctx->hwdec->image_format == pixfmt2imgfmt(fmt[i])) {
+            // There could be more reasons for a change, and it's possible
+            // that we miss some. (Might also depend on the hwaccel type.)
+            bool change =
+                ctx->hwdec_w != avctx->coded_width ||
+                ctx->hwdec_h != avctx->coded_height ||
+                ctx->hwdec_fmt != ctx->hwdec->image_format ||
+                ctx->hwdec_profile != avctx->profile ||
+                ctx->hwdec_request_reinit;
+            ctx->hwdec_w = avctx->coded_width;
+            ctx->hwdec_h = avctx->coded_height;
+            ctx->hwdec_fmt = ctx->hwdec->image_format;
+            ctx->hwdec_profile = avctx->profile;
+            ctx->hwdec_request_reinit = false;
+            if (change && ctx->hwdec->init_decoder) {
+                if (ctx->hwdec->init_decoder(ctx, ctx->hwdec_w, ctx->hwdec_h) < 0)
+                {
+                    ctx->hwdec_fmt = 0;
+                    break;
                 }
-                return fmt[i];
             }
+            return fmt[i];
         }
     }
 
@@ -640,7 +701,7 @@ static struct mp_image *read_output(struct dec_video *vd)
     if (ctx->hwdec && ctx->hwdec->process_image)
         res = ctx->hwdec->process_image(ctx, res);
 
-    return mp_img_swap_to_native(res);
+    return res ? mp_img_swap_to_native(res) : NULL;
 }
 
 static void decode(struct dec_video *vd, struct demux_packet *packet,
@@ -701,7 +762,9 @@ static void decode(struct dec_video *vd, struct demux_packet *packet,
         MP_WARN(vd, "Error while decoding frame!\n");
         if (ctx->hwdec) {
             ctx->hwdec_fail_count += 1;
-            if (ctx->hwdec_fail_count >= opts->software_fallback)
+            // The FFmpeg VT hwaccel is buggy and can crash after 1 broken frame.
+            bool vt = ctx->hwdec && ctx->hwdec->type == HWDEC_VIDEOTOOLBOX;
+            if (ctx->hwdec_fail_count >= opts->software_fallback || vt)
                 ctx->hwdec_failed = true;
         }
         if (!ctx->hwdec_failed && packet)
@@ -767,7 +830,8 @@ static struct mp_image *decode_with_fallback(struct dec_video *vd,
     decode(vd, packet, flags, &mpi);
     if (ctx->hwdec_failed) {
         // Failed hardware decoding? Try again in software.
-        if (force_fallback(vd) && ctx->avctx)
+        force_fallback(vd);
+        if (ctx->avctx)
             decode(vd, packet, flags, &mpi);
     }
 
@@ -805,8 +869,10 @@ static int control(struct dec_video *vd, int cmd, void *arg)
         return CONTROL_TRUE;
     }
     case VDCTRL_FORCE_HWDEC_FALLBACK:
-        if (force_fallback(vd))
+        if (ctx->hwdec) {
+            force_fallback(vd);
             return ctx->avctx ? CONTROL_OK : CONTROL_ERROR;
+        }
         return CONTROL_FALSE;
     case VDCTRL_REINIT:
         reinit(vd);
diff --git a/video/decode/vdpau.c b/video/decode/vdpau.c
index 313fabf..0003182 100644
--- a/video/decode/vdpau.c
+++ b/video/decode/vdpau.c
@@ -61,6 +61,17 @@ static struct mp_image *allocate_image(struct lavc_ctx *ctx, int w, int h)
     return mp_vdpau_get_video_surface(p->mpvdp, chroma, s_w, s_h);
 }
 
+static struct mp_image *update_format(struct lavc_ctx *ctx, struct mp_image *img)
+{
+    // Mark VDP_CHROMA_TYPE_420 surfaces as NV12; other images pass unchanged.
+    uint32_t w, h;
+    VdpChromaType chroma = 0;
+    int r = av_vdpau_get_surface_parameters(ctx->avctx, &chroma, &w, &h);
+    if (r >= 0 && chroma == VDP_CHROMA_TYPE_420)
+        img->params.hw_subfmt = IMGFMT_NV12;
+    return img;
+}
+
 static void uninit(struct lavc_ctx *ctx)
 {
     struct priv *p = ctx->hwdec_priv;
@@ -75,7 +86,7 @@ static int init(struct lavc_ctx *ctx)
     struct priv *p = talloc_ptrtype(NULL, p);
     *p = (struct priv) {
         .log = mp_log_new(p, ctx->log, "vdpau"),
-        .mpvdp = ctx->hwdec_info->hwctx->vdpau_ctx,
+        .mpvdp = hwdec_devices_get(ctx->hwdec_devs, HWDEC_VDPAU)->ctx,
     };
     ctx->hwdec_priv = p;
 
@@ -83,14 +94,11 @@ static int init(struct lavc_ctx *ctx)
     return 0;
 }
 
-static int probe(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
+static int probe(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
                  const char *codec)
 {
-    hwdec_request_api(info, "vdpau");
-    if (!info || !info->hwctx || !info->hwctx->vdpau_ctx)
+    if (!hwdec_devices_load(ctx->hwdec_devs, HWDEC_VDPAU))
         return HWDEC_ERR_NO_CTX;
-    if (mp_vdpau_guess_if_emulated(info->hwctx->vdpau_ctx))
-        return HWDEC_ERR_EMULATED;
     return 0;
 }
 
@@ -102,4 +110,5 @@ const struct vd_lavc_hwdec mp_vd_lavc_vdpau = {
     .uninit = uninit,
     .init_decoder = init_decoder,
     .allocate_image = allocate_image,
+    .process_image = update_format,
 };
diff --git a/video/decode/videotoolbox.c b/video/decode/videotoolbox.c
index 2d2f5f7..c69d5e8 100644
--- a/video/decode/videotoolbox.c
+++ b/video/decode/videotoolbox.c
@@ -27,11 +27,10 @@
 #include "config.h"
 
 
-static int probe(struct vd_lavc_hwdec *hwdec, struct mp_hwdec_info *info,
+static int probe(struct lavc_ctx *ctx, struct vd_lavc_hwdec *hwdec,
                  const char *codec)
 {
-    hwdec_request_api(info, "videotoolbox");
-    if (!info || !info->hwctx || !info->hwctx->get_vt_fmt)
+    if (!hwdec_devices_load(ctx->hwdec_devs, HWDEC_VIDEOTOOLBOX))
         return HWDEC_ERR_NO_CTX;
     switch (mp_codec_to_av_codec_id(codec)) {
     case AV_CODEC_ID_H264:
@@ -89,8 +88,8 @@ static int init_decoder(struct lavc_ctx *ctx, int w, int h)
 
     AVVideotoolboxContext *vtctx = av_videotoolbox_alloc_context();
 
-    struct mp_hwdec_ctx *hwctx = ctx->hwdec_info->hwctx;
-    vtctx->cv_pix_fmt_type = hwctx->get_vt_fmt(hwctx);
+    struct mp_vt_ctx *vt = hwdec_devices_load(ctx->hwdec_devs, HWDEC_VIDEOTOOLBOX);
+    vtctx->cv_pix_fmt_type = vt->get_vt_fmt(vt);
 
     int err = av_videotoolbox_default_init2(ctx->avctx, vtctx);
     if (err < 0) {
diff --git a/video/dxva2.c b/video/dxva2.c
deleted file mode 100644
index d6635ce..0000000
--- a/video/dxva2.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "common/av_common.h"
-#include "dxva2.h"
-#include "mp_image.h"
-#include "img_format.h"
-#include "mp_image_pool.h"
-
-struct dxva2_surface {
-    HMODULE d3dlib;
-    HMODULE dxva2lib;
-
-    IDirectXVideoDecoder *decoder;
-    IDirect3DSurface9    *surface;
-};
-
-IDirect3DSurface9 *d3d9_surface_in_mp_image(struct mp_image *mpi)
-{
-    return mpi && mpi->imgfmt == IMGFMT_DXVA2 ?
-        (IDirect3DSurface9 *)mpi->planes[3] : NULL;
-}
-
-static void dxva2_release_img(void *arg)
-{
-    struct dxva2_surface *surface = arg;
-    if (surface->surface)
-        IDirect3DSurface9_Release(surface->surface);
-
-    if (surface->decoder)
-        IDirectXVideoDecoder_Release(surface->decoder);
-
-    if (surface->dxva2lib)
-        FreeLibrary(surface->dxva2lib);
-
-    if (surface->d3dlib)
-        FreeLibrary(surface->d3dlib);
-
-    talloc_free(surface);
-}
-
-struct mp_image *dxva2_new_ref(IDirectXVideoDecoder *decoder,
-                               IDirect3DSurface9 *d3d9_surface, int w, int h)
-{
-    if (!decoder || !d3d9_surface)
-        return NULL;
-    struct dxva2_surface *surface = talloc_zero(NULL, struct dxva2_surface);
-
-    // Add additional references to the libraries which might otherwise be freed
-    // before the surface, which is observed to lead to bad behaviour
-    surface->d3dlib   = LoadLibrary(L"d3d9.dll");
-    surface->dxva2lib = LoadLibrary(L"dxva2.dll");
-    if (!surface->d3dlib || !surface->dxva2lib)
-        goto fail;
-
-    surface->surface = d3d9_surface;
-    IDirect3DSurface9_AddRef(surface->surface);
-    surface->decoder = decoder;
-    IDirectXVideoDecoder_AddRef(surface->decoder);
-
-    struct mp_image *mpi = mp_image_new_custom_ref(&(struct mp_image){0},
-                                                   surface, dxva2_release_img);
-    if (!mpi)
-        abort();
-
-    mp_image_setfmt(mpi, IMGFMT_DXVA2);
-    mp_image_set_size(mpi, w, h);
-    mpi->planes[3] = (void *)surface->surface;
-    return mpi;
-fail:
-    dxva2_release_img(surface);
-    return NULL;
-}
diff --git a/video/dxva2.h b/video/dxva2.h
deleted file mode 100644
index 1f2e4a4..0000000
--- a/video/dxva2.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef MPV_DXVA2_H
-#define MPV_DXVA2_H
-
-#include <d3d9.h>
-#include <dxva2api.h>
-
-struct mp_image;
-struct mp_image_pool;
-
-IDirect3DSurface9 *d3d9_surface_in_mp_image(struct mp_image *mpi);
-
-struct mp_image *dxva2_new_ref(IDirectXVideoDecoder *decoder,
-                               IDirect3DSurface9 *d3d9_surface, int w, int h);
-
-#endif
diff --git a/video/filter/refqueue.c b/video/filter/refqueue.c
new file mode 100644
index 0000000..04de312
--- /dev/null
+++ b/video/filter/refqueue.c
@@ -0,0 +1,230 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <assert.h>
+
+#include "common/common.h"
+#include "video/mp_image.h"
+
+#include "refqueue.h"
+
+struct mp_refqueue {
+    int needed_past_frames;   // as set by mp_refqueue_set_refs()
+    int needed_future_frames; // as set by mp_refqueue_set_refs() (clamped to >= 1)
+    int flags;                // MP_MODE_* bits from mp_refqueue_set_mode()
+
+    bool second_field; // the current output is the second field of queue[pos]
+    bool eof;          // the last mp_refqueue_add_input() call passed NULL
+
+    // Queue of input frames, used to determine past/current/future frames.
+    // queue[0] is the newest frame, queue[num_queue - 1] the oldest.
+    struct mp_image **queue;
+    int num_queue;
+    // queue[pos] is the current frame, unless pos is an invalid index.
+    int pos; // -1 after mp_refqueue_flush()
+};
+
+struct mp_refqueue *mp_refqueue_alloc(void)
+{
+    struct mp_refqueue *queue = talloc_zero(NULL, struct mp_refqueue);
+    mp_refqueue_flush(queue); // establishes the reset state (pos = -1)
+    return queue;
+}
+
+void mp_refqueue_free(struct mp_refqueue *q)
+{
+    mp_refqueue_flush(q); // release the frames still held by the queue
+    talloc_free(q);       // then the queue object itself
+}
+
+// Set how many frames must be kept before/after the current one (both >= 0).
+void mp_refqueue_set_refs(struct mp_refqueue *q, int past, int future)
+{
+    assert(past >= 0 && future >= 0);
+    q->needed_future_frames = future < 1 ? 1 : future; // >= 1, for determining PTS
+    q->needed_past_frames = past;
+}
+
+// Replace the mode with the given combination of MP_MODE_* flags.
+void mp_refqueue_set_mode(struct mp_refqueue *q, int flags)
+{
+    q->flags = flags;
+}
+
+// True if deinterlacing is enabled and applies to the current frame.
+bool mp_refqueue_should_deint(struct mp_refqueue *q)
+{
+    if (!(q->flags & MP_MODE_DEINT) || !mp_refqueue_has_output(q))
+        return false;
+    if (!(q->flags & MP_MODE_INTERLACED_ONLY))
+        return true; // every frame is deinterlaced, flagged or not
+    return (q->queue[q->pos]->fields & MP_IMGFIELD_INTERLACED) != 0;
+}
+
+// Report whether the current frame carries the MP_IMGFIELD_INTERLACED flag.
+// Returns false while there is no current output frame.
+bool mp_refqueue_is_interlaced(struct mp_refqueue *q)
+{
+    struct mp_image *cur = mp_refqueue_has_output(q) ? q->queue[q->pos] : NULL;
+
+    return cur && (cur->fields & MP_IMGFIELD_INTERLACED);
+}
+
+// True if the field currently being output is the top field, false for the
+// bottom field or when there is no output. (Assumes forced deinterlacing.)
+bool mp_refqueue_is_top_field(struct mp_refqueue *q)
+{
+    if (!mp_refqueue_has_output(q))
+        return false;
+    bool tff = (q->queue[q->pos]->fields & MP_IMGFIELD_TOP_FIRST) != 0;
+    return tff != q->second_field; // the second field has the opposite parity
+}
+
+// Report whether the current frame is flagged MP_IMGFIELD_TOP_FIRST.
+// Returns false while there is no current output frame.
+bool mp_refqueue_top_field_first(struct mp_refqueue *q)
+{
+    struct mp_image *cur = mp_refqueue_has_output(q) ? q->queue[q->pos] : NULL;
+
+    return cur && (cur->fields & MP_IMGFIELD_TOP_FIRST);
+}
+
+// Drop every queued frame and reset position, field and EOF state.
+void mp_refqueue_flush(struct mp_refqueue *q)
+{
+    for (int i = 0; i < q->num_queue; i++)
+        talloc_free(q->queue[i]);
+    q->second_field = false;
+    q->eof = false;
+    q->pos = -1; // no current frame until input is added again
+    q->num_queue = 0;
+}
+
+// Push a new input frame; the queue takes ownership of img. The frame becomes
+// the newest entry (index 0). Use mp_refqueue_next() to advance the current
+// frame and to drop past frames that are no longer needed.
+// A NULL img signals EOF: mp_refqueue_need_input() then returns false even
+// if fewer future frames than requested are queued.
+void mp_refqueue_add_input(struct mp_refqueue *q, struct mp_image *img)
+{
+    if (!img) {
+        q->eof = true;
+        return;
+    }
+    q->eof = false;
+    MP_TARRAY_INSERT_AT(q, q->queue, q->num_queue, 0, img);
+    q->pos++; // existing entries shifted by one, the current frame with them
+    assert(q->pos >= 0 && q->pos < q->num_queue);
+}
+
+bool mp_refqueue_need_input(struct mp_refqueue *q)
+{ // queue[0..pos-1] are the future frames, so pos is also their count
+    return !q->eof && q->pos < q->needed_future_frames;
+}
+
+bool mp_refqueue_has_output(struct mp_refqueue *q)
+{ // needs a valid current frame and no outstanding input request
+    return !mp_refqueue_need_input(q) && q->pos >= 0;
+}
+
+// Try to switch the current frame to its second field. On success the frame's
+// PTS is moved half a frame duration forward and true is returned; false means
+// the caller has to advance to the next frame instead.
+static bool output_next_field(struct mp_refqueue *q)
+{
+    bool fields_wanted = !q->second_field && (q->flags & MP_MODE_OUTPUT_FIELDS);
+    if (!fields_wanted || !mp_refqueue_should_deint(q))
+        return false;
+
+    assert(q->pos >= 0);
+
+    if (q->pos == 0) // no future frame, hence no (reasonable) timestamp
+        return false;
+
+    struct mp_image *cur = q->queue[q->pos];
+    struct mp_image *next = q->queue[q->pos - 1];
+    if (cur->pts == MP_NOPTS_VALUE || next->pts == MP_NOPTS_VALUE)
+        return false;
+
+    // Reject frame durations outside the open interval (0, 1).
+    double duration = next->pts - cur->pts;
+    if (duration <= 0.0 || duration >= 1.0)
+        return false;
+
+    cur->pts += duration / 2;
+    q->second_field = true;
+    return true;
+}
+
+// Step to the next output: the second field of the current frame if field
+// output applies, otherwise the next frame. No-op without a current output.
+void mp_refqueue_next_field(struct mp_refqueue *q)
+{
+    bool have_output = mp_refqueue_has_output(q);
+
+    if (have_output && !output_next_field(q))
+        mp_refqueue_next(q);
+}
+
+// Make the next (newer) input frame the current one, skipping any pending
+// second field, then release past frames beyond the configured count.
+void mp_refqueue_next(struct mp_refqueue *q)
+{
+    if (!mp_refqueue_has_output(q))
+        return;
+
+    q->second_field = false;
+    q->pos -= 1;
+    assert(q->pos >= -1 && q->pos < q->num_queue);
+
+    // Free the oldest entries (queue tail) beyond the allowed past frames.
+    int excess = q->num_queue - (q->pos + 1) - q->needed_past_frames;
+    for (; excess > 0; excess--) {
+        assert(q->num_queue > 0);
+        talloc_free(q->queue[--q->num_queue]);
+    }
+
+    assert(q->pos >= -1 && q->pos < q->num_queue);
+}
+
+// Look up a frame relative to the current one:
+//  -1: first past frame
+//   0: current frame
+//   1: first future frame
+// The queue keeps ownership. NULL is returned if no such frame is queued.
+struct mp_image *mp_refqueue_get(struct mp_refqueue *q, int pos)
+{
+    int index = q->pos - pos; // newer frames live at lower indices
+    return (index < 0 || index >= q->num_queue) ? NULL : q->queue[index];
+}
+
+// Like mp_refqueue_get(), but pos counts fields relative to the current
+// field; the frame containing the requested field is returned.
+struct mp_image *mp_refqueue_get_field(struct mp_refqueue *q, int pos)
+{
+    // 1 if the current field is (conceptually) the second field of its frame.
+    int second = mp_refqueue_top_field_first(q) != mp_refqueue_is_top_field(q);
+    // Halve rounding towards negative infinity: from a second field, pos=1
+    // lies in the next frame while pos=-1 still lies in the current frame.
+    int frame = (pos < 0 ? pos - (1 - second) : pos + second) / 2;
+    return mp_refqueue_get(q, frame);
+}
+
+bool mp_refqueue_is_second_field(struct mp_refqueue *q)
+{ // always false while there is no current output
+    return q->second_field && mp_refqueue_has_output(q);
+}
diff --git a/video/filter/refqueue.h b/video/filter/refqueue.h
new file mode 100644
index 0000000..ef23bee
--- /dev/null
+++ b/video/filter/refqueue.h
@@ -0,0 +1,36 @@
+#ifndef MP_REFQUEUE_H_
+#define MP_REFQUEUE_H_
+
+#include <stdbool.h>
+
+// A helper for deinterlacers which require past/future reference frames.
+struct mp_refqueue;
+struct mp_image; // forward declaration, so this header works if included first
+
+struct mp_refqueue *mp_refqueue_alloc(void);
+void mp_refqueue_free(struct mp_refqueue *q);
+// Queue handling. add_input() takes ownership of img; a NULL img signals EOF.
+void mp_refqueue_set_refs(struct mp_refqueue *q, int past, int future);
+void mp_refqueue_flush(struct mp_refqueue *q);
+void mp_refqueue_add_input(struct mp_refqueue *q, struct mp_image *img);
+bool mp_refqueue_need_input(struct mp_refqueue *q);
+bool mp_refqueue_has_output(struct mp_refqueue *q);
+void mp_refqueue_next(struct mp_refqueue *q);
+void mp_refqueue_next_field(struct mp_refqueue *q);
+struct mp_image *mp_refqueue_get(struct mp_refqueue *q, int pos);
+
+enum {
+    MP_MODE_DEINT = (1 << 0),           // deinterlacing enabled
+    MP_MODE_OUTPUT_FIELDS = (1 << 1),   // output fields separately
+    MP_MODE_INTERLACED_ONLY = (1 << 2), // only deinterlace marked frames
+};
+// Mode selection (MP_MODE_* flags) and queries about the current output.
+void mp_refqueue_set_mode(struct mp_refqueue *q, int flags);
+bool mp_refqueue_should_deint(struct mp_refqueue *q);
+bool mp_refqueue_is_interlaced(struct mp_refqueue *q);
+bool mp_refqueue_is_top_field(struct mp_refqueue *q);
+bool mp_refqueue_top_field_first(struct mp_refqueue *q);
+bool mp_refqueue_is_second_field(struct mp_refqueue *q);
+struct mp_image *mp_refqueue_get_field(struct mp_refqueue *q, int pos);
+
+#endif
diff --git a/video/filter/vf.c b/video/filter/vf.c
index d8e7f6b..176ac95 100644
--- a/video/filter/vf.c
+++ b/video/filter/vf.c
@@ -61,6 +61,7 @@ extern const vf_info_t vf_info_vapoursynth_lazy;
 extern const vf_info_t vf_info_vdpaupp;
 extern const vf_info_t vf_info_vdpaurb;
 extern const vf_info_t vf_info_buffer;
+extern const vf_info_t vf_info_d3d11vpp;
 
 // list of available filters:
 static const vf_info_t *const filter_list[] = {
@@ -99,6 +100,9 @@ static const vf_info_t *const filter_list[] = {
     &vf_info_vdpaupp,
     &vf_info_vdpaurb,
 #endif
+#if HAVE_D3D_HWACCEL
+    &vf_info_d3d11vpp,
+#endif
     NULL
 };
 
@@ -244,7 +248,7 @@ static struct vf_instance *vf_open(struct vf_chain *c, const char *name,
     *vf = (vf_instance_t) {
         .info = desc.p,
         .log = mp_log_new(vf, c->log, name),
-        .hwdec = c->hwdec,
+        .hwdec_devs = c->hwdec_devs,
         .query_format = vf_default_query_format,
         .out_pool = talloc_steal(vf, mp_image_pool_new(16)),
         .chain = c,
@@ -514,7 +518,23 @@ static void query_formats(uint8_t *fmts, struct vf_instance *vf)
 
 static bool is_conv_filter(struct vf_instance *vf)
 {
-    return vf && strcmp(vf->info->name, "scale") == 0;
+    return vf && (strcmp(vf->info->name, "scale") == 0 || vf->autoinserted);
+}
+
+static const char *find_conv_filter(uint8_t *fmts_in, uint8_t *fmts_out)
+{
+    for (const vf_info_t *const *f = filter_list; *f; f++) {
+        if (!(*f)->test_conversion)
+            continue; // this filter does not offer format conversion
+        for (int in = IMGFMT_START; in < IMGFMT_END; in++) {
+            for (int out = IMGFMT_START; out < IMGFMT_END; out++) {
+                if (fmts_in[in - IMGFMT_START] && fmts_out[out - IMGFMT_START] &&
+                    (*f)->test_conversion(in, out))
+                    return (*f)->name;
+            }
+        }
+    }
+    return "scale"; // no filter with a matching test_conversion() was found
 }
 
 static void update_formats(struct vf_chain *c, struct vf_instance *vf,
@@ -535,7 +555,18 @@ static void update_formats(struct vf_chain *c, struct vf_instance *vf,
         // filters after vf work, but vf can't output any format the filters
         // after it accept), try to insert a conversion filter.
         MP_INFO(c, "Using conversion filter.\n");
-        struct vf_instance *conv = vf_open(c, "scale", NULL);
+        // Determine which output formats the filter _could_ accept. For this
+        // to work after the conversion filter is inserted, it is assumed that
+        // conversion filters have a single set of in/output formats that can
+        // be converted to each other.
+        uint8_t out_formats[IMGFMT_END - IMGFMT_START];
+        for (int n = IMGFMT_START; n < IMGFMT_END; n++) {
+            out_formats[n - IMGFMT_START] = vf->last_outfmts[n - IMGFMT_START];
+            vf->last_outfmts[n - IMGFMT_START] = 1;
+        }
+        query_formats(fmts, vf);
+        const char *filter = find_conv_filter(fmts, out_formats);
+        struct vf_instance *conv = vf_open(c, filter, NULL);
         if (conv) {
             conv->autoinserted = true;
             conv->next = vf->next;
diff --git a/video/filter/vf.h b/video/filter/vf.h
index c982b61..49296fb 100644
--- a/video/filter/vf.h
+++ b/video/filter/vf.h
@@ -37,6 +37,7 @@ typedef struct vf_info {
     const void *priv_defaults;
     const struct m_option *options;
     void (*print_help)(struct mp_log *log);
+    bool (*test_conversion)(int in, int out);
 } vf_info_t;
 
 typedef struct vf_instance {
@@ -92,7 +93,7 @@ typedef struct vf_instance {
     struct mp_image_pool *out_pool;
     struct vf_priv_s *priv;
     struct mp_log *log;
-    struct mp_hwdec_info *hwdec;
+    struct mp_hwdec_devices *hwdec_devs;
 
     struct mp_image **out_queued;
     int num_out_queued;
@@ -120,7 +121,7 @@ struct vf_chain {
     struct mp_log *log;
     struct MPOpts *opts;
     struct mpv_global *global;
-    struct mp_hwdec_info *hwdec;
+    struct mp_hwdec_devices *hwdec_devs;
 
     // Call when the filter chain wants new processing (for filters with
     // asynchronous behavior) - must be immutable once filters are created,
diff --git a/video/filter/vf_crop.c b/video/filter/vf_crop.c
index 89b2b6f..6f9a788 100644
--- a/video/filter/vf_crop.c
+++ b/video/filter/vf_crop.c
@@ -51,10 +51,23 @@ static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
     if(vf->priv->crop_y<0) vf->priv->crop_y=(height-vf->priv->crop_h)/2;
     // rounding:
 
+    int orig_x = vf->priv->crop_x;
+    int orig_y = vf->priv->crop_y;
+
     struct mp_imgfmt_desc fmt = mp_imgfmt_get_desc(in->imgfmt);
 
-    vf->priv->crop_x = MP_ALIGN_DOWN(vf->priv->crop_x, fmt.align_x);
-    vf->priv->crop_y = MP_ALIGN_DOWN(vf->priv->crop_y, fmt.align_y);
+    if (fmt.flags & MP_IMGFLAG_HWACCEL) {
+        vf->priv->crop_x = 0;
+        vf->priv->crop_y = 0;
+    } else {
+        vf->priv->crop_x = MP_ALIGN_DOWN(vf->priv->crop_x, fmt.align_x);
+        vf->priv->crop_y = MP_ALIGN_DOWN(vf->priv->crop_y, fmt.align_y);
+    }
+
+    if (vf->priv->crop_x != orig_x || vf->priv->crop_y != orig_y) {
+        MP_WARN(vf, "Adjusting crop origin to %d/%d for pixel format alignment.\n",
+                vf->priv->crop_x, vf->priv->crop_y);
+    }
 
     // check:
     if(vf->priv->crop_w+vf->priv->crop_x>width ||
@@ -71,17 +84,19 @@ static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
 
 static struct mp_image *filter(struct vf_instance *vf, struct mp_image *mpi)
 {
-    mp_image_crop(mpi, vf->priv->crop_x, vf->priv->crop_y,
-                  vf->priv->crop_x + vf->priv->crop_w,
-                  vf->priv->crop_y + vf->priv->crop_h);
+    if (mpi->fmt.flags & MP_IMGFLAG_HWACCEL) {
+        mp_image_set_size(mpi, vf->fmt_out.w, vf->fmt_out.h);
+    } else {
+        mp_image_crop(mpi, vf->priv->crop_x, vf->priv->crop_y,
+                      vf->priv->crop_x + vf->priv->crop_w,
+                      vf->priv->crop_y + vf->priv->crop_h);
+    }
     return mpi;
 }
 
 static int query_format(struct vf_instance *vf, unsigned int fmt)
 {
-    if (!IMGFMT_IS_HWACCEL(fmt))
-        return vf_next_query_format(vf, fmt);
-    return 0;
+    return vf_next_query_format(vf, fmt);
 }
 
 static int vf_open(vf_instance_t *vf){
diff --git a/video/filter/vf_d3d11vpp.c b/video/filter/vf_d3d11vpp.c
new file mode 100644
index 0000000..a0aa0ed
--- /dev/null
+++ b/video/filter/vf_d3d11vpp.c
@@ -0,0 +1,537 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <initguid.h>
+#include <assert.h>
+#include <windows.h>
+#include <d3d11.h>
+
+#include "common/common.h"
+#include "osdep/timer.h"
+#include "osdep/windows_utils.h"
+#include "vf.h"
+#include "refqueue.h"
+#include "video/hwdec.h"
+#include "video/mp_image_pool.h"
+
+// missing in MinGW
+#define D3D11_VIDEO_PROCESSOR_PROCESSOR_CAPS_DEINTERLACE_BOB 0x2
+
+struct vf_priv_s {
+    ID3D11Device *vo_dev; // from hwdec_devices_load(); referenced in vf_open()
+
+    ID3D11DeviceContext *device_ctx; // immediate context of vo_dev
+    ID3D11VideoDevice *video_dev;    // queried from vo_dev
+    ID3D11VideoContext *video_ctx;   // queried from device_ctx
+
+    ID3D11VideoProcessor *video_proc;        // built by recreate_video_proc()
+    ID3D11VideoProcessorEnumerator *vp_enum; // built by recreate_video_proc()
+    D3D11_VIDEO_FRAME_FORMAT d3d_frame_format; // format video_proc was built for
+
+    DXGI_FORMAT out_format; // format of the textures made by alloc_pool()
+    bool out_shared;        // create them with D3D11_RESOURCE_MISC_SHARED
+    bool out_rgb;           // RGB output was selected in reconfig()
+
+    bool require_filtering; // in/out hw_subfmt differ, which forces render()
+
+    struct mp_image_params params, out_params; // input/output of reconfig()
+    int c_w, c_h; // input texture size video_proc was built for
+
+    struct mp_image_pool *pool; // output images, allocated via alloc_pool()
+
+    struct mp_refqueue *queue; // pending input frames
+
+    int deint_enabled;   // "deint" option; also changed via control()
+    int interlaced_only; // "interlaced-only" option
+};
+
+// mp_image free callback: drops the texture reference held by the image.
+static void release_tex(void *arg)
+{
+    ID3D11Texture2D *tex = arg;
+    ID3D11Texture2D_Release(tex);
+}
+
+// mp_image_pool allocator: creates one output texture of size w x h and wraps
+// it in an mp_image. fmt is ignored; the format comes from p->out_params.
+// Returns NULL if the texture cannot be created.
+static struct mp_image *alloc_pool(void *pctx, int fmt, int w, int h)
+{
+    struct vf_instance *vf = pctx;
+    struct vf_priv_s *p = vf->priv;
+
+    D3D11_TEXTURE2D_DESC desc = {
+        .Width = w,
+        .Height = h,
+        .Format = p->out_format,
+        .MipLevels = 1,
+        .ArraySize = 1,
+        .SampleDesc = { .Count = 1 },
+        .Usage = D3D11_USAGE_DEFAULT,
+        .BindFlags = D3D11_BIND_RENDER_TARGET | D3D11_BIND_SHADER_RESOURCE,
+        .MiscFlags = p->out_shared ? D3D11_RESOURCE_MISC_SHARED : 0,
+    };
+    ID3D11Texture2D *tex = NULL;
+    if (FAILED(ID3D11Device_CreateTexture2D(p->vo_dev, &desc, NULL, &tex)))
+        return NULL;
+
+    // The image holds the texture reference; release_tex() drops it.
+    struct mp_image *img = mp_image_new_custom_ref(NULL, tex, release_tex);
+    if (!img)
+        abort();
+
+    mp_image_setfmt(img, p->out_params.imgfmt);
+    mp_image_set_size(img, w, h);
+    img->params.hw_subfmt = p->out_params.hw_subfmt;
+    img->planes[1] = (void *)tex;         // the texture itself
+    img->planes[2] = (void *)(intptr_t)0; // subindex 0 (single-slice texture)
+    return img;
+}
+
+static void flush_frames(struct vf_instance *vf)
+{
+    struct vf_priv_s *priv = vf->priv;
+    mp_refqueue_flush(priv->queue); // discards every queued input frame
+}
+
+static int filter_ext(struct vf_instance *vf, struct mp_image *in)
+{
+    struct vf_priv_s *p = vf->priv;
+    int mode = MP_MODE_OUTPUT_FIELDS; // fields are output separately
+    if (p->deint_enabled)
+        mode |= MP_MODE_DEINT;
+    if (p->interlaced_only)
+        mode |= MP_MODE_INTERLACED_ONLY;
+    mp_refqueue_set_refs(p->queue, 0, 0); // no reference frames requested
+    mp_refqueue_set_mode(p->queue, mode);
+    mp_refqueue_add_input(p->queue, in); // the queue now owns the frame
+    return 0;
+}
+
+static void destroy_video_proc(struct vf_instance *vf)
+{
+    struct vf_priv_s *p = vf->priv;
+    if (p->video_proc) { // pointers are reset so repeated calls are harmless
+        ID3D11VideoProcessor_Release(p->video_proc);
+        p->video_proc = NULL;
+    }
+    if (p->vp_enum) {
+        ID3D11VideoProcessorEnumerator_Release(p->vp_enum);
+        p->vp_enum = NULL;
+    }
+}
+
+static int recreate_video_proc(struct vf_instance *vf) // 0 = ok, -1 = failure
+{
+    struct vf_priv_s *p = vf->priv;
+    HRESULT hr;
+
+    destroy_video_proc(vf); // always rebuild enumerator and processor
+
+    D3D11_VIDEO_PROCESSOR_CONTENT_DESC vpdesc = {
+        .InputFrameFormat = p->d3d_frame_format,
+        .InputWidth = p->c_w, // c_w/c_h: input texture size, set by render()
+        .InputHeight = p->c_h,
+        .OutputWidth = p->params.w,
+        .OutputHeight = p->params.h,
+    };
+    hr = ID3D11VideoDevice_CreateVideoProcessorEnumerator(p->video_dev, &vpdesc,
+                                                          &p->vp_enum);
+    if (FAILED(hr))
+        goto fail;
+
+    D3D11_VIDEO_PROCESSOR_CAPS caps;
+    hr = ID3D11VideoProcessorEnumerator_GetVideoProcessorCaps(p->vp_enum, &caps);
+    if (FAILED(hr))
+        goto fail;
+
+    MP_VERBOSE(vf, "Found %d rate conversion caps.\n",
+               (int)caps.RateConversionCapsCount);
+
+    int rindex = -1; // index of the first entry that supports bob deinterlacing
+    for (int n = 0; n < caps.RateConversionCapsCount; n++) {
+        D3D11_VIDEO_PROCESSOR_RATE_CONVERSION_CAPS rcaps;
+        hr = ID3D11VideoProcessorEnumerator_GetVideoProcessorRateConversionCaps
+                (p->vp_enum, n, &rcaps);
+        if (FAILED(hr))
+            goto fail;
+        MP_VERBOSE(vf, "  - %d: 0x%08x\n", n, (unsigned)rcaps.ProcessorCaps);
+        if (rcaps.ProcessorCaps & D3D11_VIDEO_PROCESSOR_PROCESSOR_CAPS_DEINTERLACE_BOB)
+        {
+            MP_VERBOSE(vf, "       (matching)\n");
+            if (rindex < 0)
+                rindex = n;
+        }
+    }
+
+    if (rindex < 0) {
+        MP_WARN(vf, "No video deinterlacing processor found.\n");
+        rindex = 0; // still try entry 0, so a processor is created anyway
+    }
+
+    hr = ID3D11VideoDevice_CreateVideoProcessor(p->video_dev, p->vp_enum, rindex,
+                                                &p->video_proc);
+    if (FAILED(hr)) {
+        MP_ERR(vf, "Failed to create D3D11 video processor.\n");
+        goto fail;
+    }
+
+    // Note: libavcodec does not support cropping left/top with hwaccel.
+    RECT src_rc = {
+        .right = p->params.w,
+        .bottom = p->params.h,
+    };
+    ID3D11VideoContext_VideoProcessorSetStreamSourceRect(p->video_ctx,
+                                                         p->video_proc,
+                                                         0, TRUE, &src_rc);
+
+    // This is supposed to stop drivers from degrading the video quality.
+    ID3D11VideoContext_VideoProcessorSetStreamAutoProcessingMode(p->video_ctx,
+                                                                 p->video_proc,
+                                                                 0, FALSE);
+
+    ID3D11VideoContext_VideoProcessorSetStreamOutputRate(p->video_ctx,
+                                                         p->video_proc,
+                                                         0,
+                                                         D3D11_VIDEO_PROCESSOR_OUTPUT_RATE_NORMAL,
+                                                         FALSE, 0);
+
+    D3D11_VIDEO_PROCESSOR_COLOR_SPACE csp = {
+        .YCbCr_Matrix = p->params.colorspace != MP_CSP_BT_601, // 0 only for BT.601
+        .Nominal_Range = p->params.colorlevels == MP_CSP_LEVELS_TV ? 1 : 2,
+    };
+    ID3D11VideoContext_VideoProcessorSetStreamColorSpace(p->video_ctx,
+                                                         p->video_proc,
+                                                         0, &csp);
+    if (p->out_rgb) {
+        // Only warn here; the output color space is left untouched for RGB.
+        if (p->params.colorspace != MP_CSP_BT_601 &&
+            p->params.colorspace != MP_CSP_BT_709)
+        {
+            MP_WARN(vf, "Unsupported video colorspace (%s/%s). Consider "
+                    "disabling hardware decoding, or using "
+                    "--hwdec=d3d11va-copy to get correct output.\n",
+                    m_opt_choice_str(mp_csp_names, p->params.colorspace),
+                    m_opt_choice_str(mp_csp_levels_names, p->params.colorlevels));
+        }
+    } else {
+        ID3D11VideoContext_VideoProcessorSetOutputColorSpace(p->video_ctx,
+                                                             p->video_proc,
+                                                             &csp);
+    }
+
+    return 0;
+fail:
+    destroy_video_proc(vf); // leave no half-initialized enumerator/processor
+    return -1;
+}
+
+static int render(struct vf_instance *vf) // 0 = frame queued, -1 = failure
+{
+    struct vf_priv_s *p = vf->priv;
+    int res = -1;
+    HRESULT hr;
+    ID3D11VideoProcessorInputView *in_view = NULL;
+    ID3D11VideoProcessorOutputView *out_view = NULL;
+    struct mp_image *in = NULL, *out = NULL;
+    out = mp_image_pool_get(p->pool, p->out_params.imgfmt, p->params.w, p->params.h);
+    if (!out)
+        goto cleanup;
+
+    ID3D11Texture2D *d3d_out_tex = (void *)out->planes[1];
+
+    in = mp_refqueue_get(p->queue, 0); // current frame; still owned by the queue
+    if (!in)
+        goto cleanup;
+    ID3D11Texture2D *d3d_tex = (void *)in->planes[1];
+    int d3d_subindex = (intptr_t)in->planes[2]; // used as array slice below
+
+    mp_image_copy_attributes(out, in);
+
+    // Frame-level format (field order of the whole frame); a change of it
+    // triggers re-creation of the video processor.
+    D3D11_VIDEO_FRAME_FORMAT d3d_frame_format;
+    if (!mp_refqueue_is_interlaced(p->queue)) {
+        d3d_frame_format = D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE;
+    } else if (mp_refqueue_top_field_first(p->queue)) {
+        d3d_frame_format = D3D11_VIDEO_FRAME_FORMAT_INTERLACED_TOP_FIELD_FIRST;
+    } else {
+        d3d_frame_format = D3D11_VIDEO_FRAME_FORMAT_INTERLACED_BOTTOM_FIELD_FIRST;
+    }
+
+    D3D11_TEXTURE2D_DESC texdesc;
+    ID3D11Texture2D_GetDesc(d3d_tex, &texdesc);
+    if (!p->video_proc || p->c_w != texdesc.Width || p->c_h != texdesc.Height ||
+        p->d3d_frame_format != d3d_frame_format)
+    {
+        p->c_w = texdesc.Width;
+        p->c_h = texdesc.Height;
+        p->d3d_frame_format = d3d_frame_format;
+        if (recreate_video_proc(vf) < 0)
+            goto cleanup;
+    }
+
+    // Per-field format: depends on which field is currently being output.
+    if (!mp_refqueue_is_interlaced(p->queue)) {
+        d3d_frame_format = D3D11_VIDEO_FRAME_FORMAT_PROGRESSIVE;
+    } else if (mp_refqueue_is_top_field(p->queue)) {
+        d3d_frame_format = D3D11_VIDEO_FRAME_FORMAT_INTERLACED_TOP_FIELD_FIRST;
+    } else {
+        d3d_frame_format = D3D11_VIDEO_FRAME_FORMAT_INTERLACED_BOTTOM_FIELD_FIRST;
+    }
+
+    ID3D11VideoContext_VideoProcessorSetStreamFrameFormat(p->video_ctx,
+                                                          p->video_proc,
+                                                          0, d3d_frame_format);
+
+    D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC indesc = {
+        .ViewDimension = D3D11_VPIV_DIMENSION_TEXTURE2D,
+        .Texture2D = {
+            .ArraySlice = d3d_subindex,
+        },
+    };
+    hr = ID3D11VideoDevice_CreateVideoProcessorInputView(p->video_dev,
+                                                         (ID3D11Resource *)d3d_tex,
+                                                         p->vp_enum, &indesc,
+                                                         &in_view);
+    if (FAILED(hr)) {
+        MP_ERR(vf, "Could not create ID3D11VideoProcessorInputView\n");
+        goto cleanup;
+    }
+
+    D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC outdesc = {
+        .ViewDimension = D3D11_VPOV_DIMENSION_TEXTURE2D,
+    };
+    hr = ID3D11VideoDevice_CreateVideoProcessorOutputView(p->video_dev,
+                                                          (ID3D11Resource *)d3d_out_tex,
+                                                          p->vp_enum, &outdesc,
+                                                          &out_view);
+    if (FAILED(hr))
+        goto cleanup;
+
+    D3D11_VIDEO_PROCESSOR_STREAM stream = {
+        .Enable = TRUE,
+        .pInputSurface = in_view,
+    };
+    int frame = mp_refqueue_is_second_field(p->queue); // 0: first, 1: second field
+    hr = ID3D11VideoContext_VideoProcessorBlt(p->video_ctx, p->video_proc,
+                                              out_view, frame, 1, &stream);
+    if (FAILED(hr)) {
+        MP_ERR(vf, "VideoProcessorBlt failed.\n");
+        goto cleanup;
+    }
+
+    // Make sure the texture is updated correctly on the shared context.
+    // (I'm not sure if this is correct, though it won't harm.)
+    if (p->out_shared)
+        ID3D11DeviceContext_Flush(p->device_ctx);
+
+    res = 0;
+cleanup:
+    if (in_view)
+        ID3D11VideoProcessorInputView_Release(in_view);
+    if (out_view)
+        ID3D11VideoProcessorOutputView_Release(out_view);
+    if (res >= 0) {
+        vf_add_output_frame(vf, out);
+    } else {
+        talloc_free(out); // out may be NULL here
+    }
+    mp_refqueue_next_field(p->queue); // advance even if rendering failed
+    return res;
+}
+
+// Emit output for the current queue position, if there is any.
+static int filter_out(struct vf_instance *vf)
+{
+    struct vf_priv_s *p = vf->priv;
+
+    if (!mp_refqueue_has_output(p->queue))
+        return 0;
+
+    bool process = p->require_filtering || mp_refqueue_should_deint(p->queue);
+    if (process)
+        return render(vf);
+
+    // No filtering needed: pass on a new reference to the queued frame.
+    vf_add_output_frame(vf, mp_image_new_ref(mp_refqueue_get(p->queue, 0)));
+    mp_refqueue_next(p->queue);
+    return 0;
+}
+
+static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
+                    struct mp_image_params *out)
+{
+    struct vf_priv_s *p = vf->priv;
+
+    flush_frames(vf); // drop all state that belongs to the old configuration
+    talloc_free(p->pool);
+    p->pool = NULL;
+
+    destroy_video_proc(vf); // render() re-creates the processor on demand
+
+    *out = *in; // output params start as a copy; format fields are set below
+
+    if (vf_next_query_format(vf, IMGFMT_D3D11VA) ||
+        vf_next_query_format(vf, IMGFMT_D3D11NV12))
+    {
+        out->imgfmt = vf_next_query_format(vf, IMGFMT_D3D11VA)
+                    ? IMGFMT_D3D11VA : IMGFMT_D3D11NV12; // D3D11VA is preferred
+        out->hw_subfmt = IMGFMT_NV12;
+        p->out_format = DXGI_FORMAT_NV12;
+        p->out_shared = false;
+        p->out_rgb = false;
+    } else {
+        out->imgfmt = IMGFMT_D3D11RGB; // fallback if neither NV12 variant is accepted
+        out->hw_subfmt = IMGFMT_RGB0;
+        p->out_format = DXGI_FORMAT_B8G8R8A8_UNORM;
+        p->out_shared = true;
+        p->out_rgb = true;
+    }
+
+    p->require_filtering = in->hw_subfmt != out->hw_subfmt; // no pass-through then
+
+    p->params = *in;
+    p->out_params = *out;
+
+    p->pool = mp_image_pool_new(20);
+    mp_image_pool_set_allocator(p->pool, alloc_pool, vf); // vf arrives as pctx
+    mp_image_pool_set_lru(p->pool);
+
+    return 0;
+}
+
+static void uninit(struct vf_instance *vf) // also the failure path of vf_open()
+{
+    struct vf_priv_s *p = vf->priv;
+
+    destroy_video_proc(vf);
+
+    flush_frames(vf);
+    mp_refqueue_free(p->queue);
+    talloc_free(p->pool); // may still be NULL if reconfig() never ran
+
+    if (p->video_ctx) // each interface is optional: vf_open() can fail midway
+        ID3D11VideoContext_Release(p->video_ctx);
+
+    if (p->video_dev)
+        ID3D11VideoDevice_Release(p->video_dev);
+
+    if (p->device_ctx)
+        ID3D11DeviceContext_Release(p->device_ctx);
+
+    if (p->vo_dev) // drops the reference taken in vf_open()
+        ID3D11Device_Release(p->vo_dev);
+}
+
+static int query_format(struct vf_instance *vf, unsigned int imgfmt)
+{
+    switch (imgfmt) {
+    case IMGFMT_D3D11VA:
+    case IMGFMT_D3D11NV12:
+    case IMGFMT_D3D11RGB: // accepted if the next filter takes any D3D11 format
+        return vf_next_query_format(vf, IMGFMT_D3D11VA) ||
+               vf_next_query_format(vf, IMGFMT_D3D11NV12) ||
+               vf_next_query_format(vf, IMGFMT_D3D11RGB);
+    default: return 0;
+    }
+}
+
+// True if both in and out are one of the three D3D11 image formats.
+static bool test_conversion(int in, int out)
+{
+    bool in_ok = in == IMGFMT_D3D11VA || in == IMGFMT_D3D11NV12 ||
+                 in == IMGFMT_D3D11RGB;
+    bool out_ok = out == IMGFMT_D3D11VA || out == IMGFMT_D3D11NV12 ||
+                  out == IMGFMT_D3D11RGB;
+    return in_ok && out_ok;
+}
+
+// Handle VFCTRL_* requests: get/set the deinterlacing switch and reset the
+// frame queue on seek. Other requests yield CONTROL_UNKNOWN.
+static int control(struct vf_instance *vf, int request, void* data)
+{
+    struct vf_priv_s *p = vf->priv;
+    int *flag = data; // the deinterlace requests pass an int through data
+    if (request == VFCTRL_GET_DEINTERLACE) {
+        *flag = p->deint_enabled != 0;
+    } else if (request == VFCTRL_SET_DEINTERLACE) {
+        p->deint_enabled = *flag != 0;
+    } else if (request == VFCTRL_SEEK_RESET) {
+        flush_frames(vf);
+    } else {
+        return CONTROL_UNKNOWN;
+    }
+    return true;
+}
+
+static int vf_open(vf_instance_t *vf) // returns 1 on success, 0 on failure
+{
+    struct vf_priv_s *p = vf->priv;
+
+    vf->reconfig = reconfig;
+    vf->filter_ext = filter_ext;
+    vf->filter_out = filter_out;
+    vf->query_format = query_format;
+    vf->uninit = uninit;
+    vf->control = control;
+
+    p->queue = mp_refqueue_alloc();
+
+    p->vo_dev = hwdec_devices_load(vf->hwdec_devs, HWDEC_D3D11VA);
+    if (!p->vo_dev)
+        goto fail; // the queue is not owned by vf, so it must be freed explicitly
+
+    ID3D11Device_AddRef(p->vo_dev); // hold our own reference; uninit() drops it
+
+    HRESULT hr;
+
+    hr = ID3D11Device_QueryInterface(p->vo_dev, &IID_ID3D11VideoDevice,
+                                     (void **)&p->video_dev);
+    if (FAILED(hr))
+        goto fail;
+
+    ID3D11Device_GetImmediateContext(p->vo_dev, &p->device_ctx);
+    if (!p->device_ctx)
+        goto fail;
+    hr = ID3D11DeviceContext_QueryInterface(p->device_ctx, &IID_ID3D11VideoContext,
+                                            (void **)&p->video_ctx);
+    if (FAILED(hr))
+        goto fail;
+
+    return 1;
+
+fail:
+    uninit(vf); // copes with members that are still NULL
+    return 0;
+}
+
+#define OPT_BASE_STRUCT struct vf_priv_s
+static const m_option_t vf_opts_fields[] = {
+    OPT_FLAG("deint", deint_enabled, 0), // stored in vf_priv_s.deint_enabled
+    OPT_FLAG("interlaced-only", interlaced_only, 0), // vf_priv_s.interlaced_only
+    {0} // terminator entry
+};
+
+const vf_info_t vf_info_d3d11vpp = { // referenced from filter_list in vf.c
+    .description = "D3D11 Video Post-Process Filter",
+    .name = "d3d11vpp",
+    .test_conversion = test_conversion, // consulted by find_conv_filter() in vf.c
+    .open = vf_open,
+    .priv_size = sizeof(struct vf_priv_s),
+    .priv_defaults = &(const struct vf_priv_s) {
+        .deint_enabled = 1,   // deinterlacing is on by default ...
+        .interlaced_only = 1, // ... but only for frames flagged as interlaced
+    },
+    .options = vf_opts_fields,
+};
diff --git a/video/filter/vf_format.c b/video/filter/vf_format.c
index ff7389c..109fda4 100644
--- a/video/filter/vf_format.c
+++ b/video/filter/vf_format.c
@@ -38,6 +38,7 @@ struct vf_priv_s {
     int colorlevels;
     int primaries;
     int gamma;
+    float peak;
     int chroma_location;
     int stereo_in;
     int stereo_out;
@@ -94,6 +95,8 @@ static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
         out->primaries = p->primaries;
     if (p->gamma)
         out->gamma = p->gamma;
+    if (p->peak)
+        out->peak = p->peak;
     if (p->chroma_location)
         out->chroma_location = p->chroma_location;
     if (p->stereo_in)
@@ -142,6 +145,7 @@ static const m_option_t vf_opts_fields[] = {
     OPT_CHOICE_C("colorlevels", colorlevels, 0, mp_csp_levels_names),
     OPT_CHOICE_C("primaries", primaries, 0, mp_csp_prim_names),
     OPT_CHOICE_C("gamma", gamma, 0, mp_csp_trc_names),
+    OPT_FLOAT("peak", peak, 0),
     OPT_CHOICE_C("chroma-location", chroma_location, 0, mp_chroma_names),
     OPT_CHOICE_C("stereo-in", stereo_in, 0, mp_stereo3d_names),
     OPT_CHOICE_C("stereo-out", stereo_out, 0, mp_stereo3d_names),
diff --git a/video/filter/vf_vavpp.c b/video/filter/vf_vavpp.c
index ae1d6b5..0365b55 100644
--- a/video/filter/vf_vavpp.c
+++ b/video/filter/vf_vavpp.c
@@ -23,6 +23,7 @@
 #include "config.h"
 #include "options/options.h"
 #include "vf.h"
+#include "refqueue.h"
 #include "video/vaapi.h"
 #include "video/hwdec.h"
 #include "video/mp_image_pool.h"
@@ -40,13 +41,6 @@ struct surface_refs {
     int num_surfaces;
 };
 
-static void add_surface(void *ta_ctx, struct surface_refs *refs, struct mp_image *s)
-{
-    VASurfaceID id = va_surface_id(s);
-    if (id != VA_INVALID_ID)
-        MP_TARRAY_APPEND(ta_ctx, refs->surfaces, refs->num_surfaces, id);
-}
-
 struct pipeline {
     VABufferID *filters;
     int num_filters;
@@ -71,16 +65,7 @@ struct vf_priv_s {
     struct mp_image_pool *pool;
     int current_rt_format;
 
-    int needed_future_frames;
-    int needed_past_frames;
-
-    // Queue of input frames, used to determine past/current/future frames.
-    // queue[0] is the newest frame, queue[num_queue - 1] the oldest.
-    struct mp_image **queue;
-    int num_queue;
-    // queue[current_pos] is the current frame, unless current_pos is not a
-    // valid index.
-    int current_pos;
+    struct mp_refqueue *queue;
 };
 
 static const struct vf_priv_s vf_priv_default = {
@@ -90,6 +75,18 @@ static const struct vf_priv_s vf_priv_default = {
     .interlaced_only = 1,
 };
 
+static void add_surfaces(struct vf_priv_s *p, struct surface_refs *refs, int dir)
+{
+    for (int n = 0; ; n++) {
+        struct mp_image *s = mp_refqueue_get(p->queue, (1 + n) * dir);
+        if (!s)
+            break;
+        VASurfaceID id = va_surface_id(s);
+        if (id != VA_INVALID_ID)
+            MP_TARRAY_APPEND(p, refs->surfaces, refs->num_surfaces, id);
+    }
+}
+
 // The array items must match with the "deint" suboption values.
 static const int deint_algorithm[] = {
     [0] = VAProcDeinterlacingNone,
@@ -103,72 +100,82 @@ static const int deint_algorithm[] = {
 static void flush_frames(struct vf_instance *vf)
 {
     struct vf_priv_s *p = vf->priv;
-    for (int n = 0; n < p->num_queue; n++)
-        talloc_free(p->queue[n]);
-    p->num_queue = 0;
-    p->current_pos = -1;
+    mp_refqueue_flush(p->queue);
 }
 
-static bool update_pipeline(struct vf_instance *vf, bool deint)
+static void update_pipeline(struct vf_instance *vf)
 {
     struct vf_priv_s *p = vf->priv;
     VABufferID *filters = p->buffers;
     int num_filters = p->num_buffers;
-    if (p->deint_type && !deint) {
+    if (p->deint_type && !p->do_deint) {
         filters++;
         num_filters--;
     }
     if (filters == p->pipe.filters && num_filters == p->pipe.num_filters)
-        return true;
+        return; /* cached state is correct */
     p->pipe.forward.num_surfaces = p->pipe.backward.num_surfaces = 0;
     p->pipe.num_input_colors = p->pipe.num_output_colors = 0;
     p->pipe.num_filters = 0;
     p->pipe.filters = NULL;
     if (!num_filters)
-        return false;
-    VAProcPipelineCaps caps;
-    caps.input_color_standards = p->pipe.input_colors;
-    caps.output_color_standards = p->pipe.output_colors;
-    caps.num_input_color_standards = VAProcColorStandardCount;
-    caps.num_output_color_standards = VAProcColorStandardCount;
+        goto nodeint;
+    VAProcPipelineCaps caps = {
+        .input_color_standards = p->pipe.input_colors,
+        .output_color_standards = p->pipe.output_colors,
+        .num_input_color_standards = VAProcColorStandardCount,
+        .num_output_color_standards = VAProcColorStandardCount,
+    };
     VAStatus status = vaQueryVideoProcPipelineCaps(p->display, p->context,
                                                    filters, num_filters, &caps);
     if (!check_error(vf, status, "vaQueryVideoProcPipelineCaps()"))
-        return false;
+        goto nodeint;
     p->pipe.filters = filters;
     p->pipe.num_filters = num_filters;
     p->pipe.num_input_colors = caps.num_input_color_standards;
     p->pipe.num_output_colors = caps.num_output_color_standards;
-    p->needed_future_frames = caps.num_forward_references;
-    p->needed_past_frames = caps.num_backward_references;
-    return true;
-}
+    mp_refqueue_set_refs(p->queue, caps.num_backward_references,
+                                   caps.num_forward_references);
+    mp_refqueue_set_mode(p->queue,
+        (p->do_deint ? MP_MODE_DEINT : 0) |
+        (p->deint_type >= 2 ? MP_MODE_OUTPUT_FIELDS : 0) |
+        (p->interlaced_only ? MP_MODE_INTERLACED_ONLY : 0));
+    return;
 
-static inline int get_deint_field(struct vf_priv_s *p, int i,
-                                  struct mp_image *mpi)
-{
-    if (!p->do_deint || !(mpi->fields & MP_IMGFIELD_INTERLACED))
-        return VA_FRAME_PICTURE;
-    return !!(mpi->fields & MP_IMGFIELD_TOP_FIRST) ^ i ? VA_TOP_FIELD : VA_BOTTOM_FIELD;
+nodeint:
+    mp_refqueue_set_refs(p->queue, 0, 0);
+    mp_refqueue_set_mode(p->queue, 0);
 }
 
-static struct mp_image *render(struct vf_instance *vf, struct mp_image *in,
-                               unsigned int flags)
+static struct mp_image *render(struct vf_instance *vf)
 {
     struct vf_priv_s *p = vf->priv;
+
+    struct mp_image *in = mp_refqueue_get(p->queue, 0);
+    struct mp_image *img = NULL;
+    bool need_end_picture = false;
+    bool success = false;
+
     VASurfaceID in_id = va_surface_id(in);
     if (!p->pipe.filters || in_id == VA_INVALID_ID)
-        return NULL;
+        goto cleanup;
 
     int r_w, r_h;
     va_surface_get_uncropped_size(in, &r_w, &r_h);
-    struct mp_image *img = mp_image_pool_get(p->pool, IMGFMT_VAAPI, r_w, r_h);
+    img = mp_image_pool_get(p->pool, IMGFMT_VAAPI, r_w, r_h);
     if (!img)
-        return NULL;
+        goto cleanup;
     mp_image_set_size(img, in->w, in->h);
-
-    bool need_end_picture = false;
-    bool success = false;
+    mp_image_copy_attributes(img, in);
+
+    unsigned int flags = va_get_colorspace_flag(p->params.colorspace);
+    if (!mp_refqueue_is_interlaced(p->queue)) {
+        flags |= VA_FRAME_PICTURE;
+    } else if (mp_refqueue_is_top_field(p->queue)) {
+        flags |= VA_TOP_FIELD;
+    } else {
+        flags |= VA_BOTTOM_FIELD;
+    }
 
     VASurfaceID id = va_surface_id(img);
     if (id == VA_INVALID_ID)
@@ -194,7 +201,7 @@ static struct mp_image *render(struct vf_instance *vf, struct mp_image *in,
         goto cleanup;
 
     filter_params->flags = flags & VA_TOP_FIELD ? 0 : VA_DEINTERLACING_BOTTOM_FIELD;
-    if (!(in->fields & MP_IMGFIELD_TOP_FIRST))
+    if (!mp_refqueue_top_field_first(p->queue))
         filter_params->flags |= VA_DEINTERLACING_BOTTOM_FIELD_FIRST;
 
     vaUnmapBuffer(p->display, *(p->pipe.filters));
@@ -211,19 +218,11 @@ static struct mp_image *render(struct vf_instance *vf, struct mp_image *in,
     param->filters = p->pipe.filters;
     param->num_filters = p->pipe.num_filters;
 
-    for (int n = 0; n < p->needed_future_frames; n++) {
-        int idx = p->current_pos - 1 - n;
-        if (idx >= 0 && idx < p->num_queue)
-            add_surface(p, &p->pipe.forward, p->queue[idx]);
-    }
+    add_surfaces(p, &p->pipe.forward, 1);
     param->forward_references = p->pipe.forward.surfaces;
     param->num_forward_references = p->pipe.forward.num_surfaces;
 
-    for (int n = 0; n < p->needed_past_frames; n++) {
-        int idx = p->current_pos + 1 + n;
-        if (idx >= 0 && idx < p->num_queue)
-            add_surface(p, &p->pipe.backward, p->queue[idx]);
-    }
+    add_surfaces(p, &p->pipe.backward, -1);
     param->backward_references = p->pipe.backward.surfaces;
     param->num_backward_references = p->pipe.backward.num_surfaces;
 
@@ -244,47 +243,6 @@ cleanup:
     return NULL;
 }
 
-static void output_frames(struct vf_instance *vf)
-{
-    struct vf_priv_s *p = vf->priv;
-
-    struct mp_image *in = p->queue[p->current_pos];
-    double prev_pts = p->current_pos + 1 < p->num_queue
-        ? p->queue[p->current_pos + 1]->pts : MP_NOPTS_VALUE;
-
-    bool deint = p->do_deint && p->deint_type > 0;
-    if (!update_pipeline(vf, deint) || !p->pipe.filters) { // no filtering
-        vf_add_output_frame(vf, mp_image_new_ref(in));
-        return;
-    }
-    unsigned int csp = va_get_colorspace_flag(p->params.colorspace);
-    unsigned int field = get_deint_field(p, 0, in);
-    if (field == VA_FRAME_PICTURE && p->interlaced_only) {
-        vf_add_output_frame(vf, mp_image_new_ref(in));
-        return;
-    }
-    struct mp_image *out1 = render(vf, in, field | csp);
-    if (!out1) { // cannot render
-        vf_add_output_frame(vf, mp_image_new_ref(in));
-        return;
-    }
-    mp_image_copy_attributes(out1, in);
-    vf_add_output_frame(vf, out1);
-    // first-field only
-    if (field == VA_FRAME_PICTURE || (p->do_deint && p->deint_type < 2))
-        return;
-    double add = (in->pts - prev_pts) * 0.5;
-    if (prev_pts == MP_NOPTS_VALUE || add <= 0.0 || add > 0.5) // no pts, skip it
-        return;
-    struct mp_image *out2 = render(vf, in, get_deint_field(p, 1, in) | csp);
-    if (!out2) // cannot render
-        return;
-    mp_image_copy_attributes(out2, in);
-    out2->pts = in->pts + add;
-    vf_add_output_frame(vf, out2);
-    return;
-}
-
 static struct mp_image *upload(struct vf_instance *vf, struct mp_image *in)
 {
     struct vf_priv_s *p = vf->priv;
@@ -303,45 +261,40 @@ static int filter_ext(struct vf_instance *vf, struct mp_image *in)
 {
     struct vf_priv_s *p = vf->priv;
 
-    if (in) {
-        int rt_format = in->imgfmt == IMGFMT_VAAPI ? va_surface_rt_format(in)
-                                                   : VA_RT_FORMAT_YUV420;
-        if (!p->pool || p->current_rt_format != rt_format) {
-            talloc_free(p->pool);
-            p->pool = mp_image_pool_new(20);
-            va_pool_set_allocator(p->pool, p->va, rt_format);
-            p->current_rt_format = rt_format;
-        }
-        if (in->imgfmt != IMGFMT_VAAPI) {
-            struct mp_image *tmp = upload(vf, in);
-            talloc_free(in);
-            in = tmp;
-            if (!in)
-                return -1;
-        }
-    }
+    update_pipeline(vf);
 
-    if (in) {
-        MP_TARRAY_INSERT_AT(p, p->queue, p->num_queue, 0, in);
-        p->current_pos++;
-        assert(p->num_queue != 1 || p->current_pos == 0);
+    if (in && in->imgfmt != IMGFMT_VAAPI) {
+        struct mp_image *tmp = upload(vf, in);
+        talloc_free(in);
+        in = tmp;
+        if (!in)
+            return -1;
     }
 
-    // Discard unneeded past frames.
-    // Note that we keep at least 1 past frame (for PTS calculations).
-    while (p->num_queue - (p->current_pos + 1) > MPMAX(p->needed_past_frames, 1)) {
-        assert(p->num_queue > 0);
-        talloc_free(p->queue[p->num_queue - 1]);
-        p->num_queue--;
-    }
+    mp_refqueue_add_input(p->queue, in);
+    return 0;
+}
+
+static int filter_out(struct vf_instance *vf)
+{
+    struct vf_priv_s *p = vf->priv;
 
-    if (p->current_pos < p->needed_future_frames && in)
-        return 0; // wait until future frames have been filled
+    if (!mp_refqueue_has_output(p->queue))
+        return 0;
 
-    if (p->current_pos >= 0 && p->current_pos < p->num_queue) {
-        output_frames(vf);
-        p->current_pos--;
+    // no filtering
+    if (!p->pipe.num_filters || !mp_refqueue_should_deint(p->queue)) {
+        struct mp_image *in = mp_refqueue_get(p->queue, 0);
+        vf_add_output_frame(vf, mp_image_new_ref(in));
+        mp_refqueue_next(p->queue);
+        return 0;
     }
+
+    struct mp_image *out = render(vf);
+    mp_refqueue_next_field(p->queue);
+    if (!out)
+        return -1; // cannot render
+    vf_add_output_frame(vf, out);
     return 0;
 }
 
@@ -350,10 +303,25 @@ static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
 {
     struct vf_priv_s *p = vf->priv;
 
+    flush_frames(vf);
+    talloc_free(p->pool);
+    p->pool = NULL;
+
     p->params = *in;
+
+    p->current_rt_format = VA_RT_FORMAT_YUV420;
+    p->pool = mp_image_pool_new(20);
+    va_pool_set_allocator(p->pool, p->va, p->current_rt_format);
+
+    struct mp_image *probe = mp_image_pool_get(p->pool, IMGFMT_VAAPI, in->w, in->h);
+    if (!probe)
+        return -1;
+    va_surface_init_subformat(probe);
     *out = *in;
-    out->imgfmt = IMGFMT_VAAPI;
-    flush_frames(vf);
+    out->imgfmt = probe->params.imgfmt;
+    out->hw_subfmt = probe->params.hw_subfmt;
+    talloc_free(probe);
+
     return 0;
 }
 
@@ -368,6 +336,7 @@ static void uninit(struct vf_instance *vf)
         vaDestroyConfig(p->display, p->config);
     talloc_free(p->pool);
     flush_frames(vf);
+    mp_refqueue_free(p->queue);
 }
 
 static int query_format(struct vf_instance *vf, unsigned int imgfmt)
@@ -476,19 +445,20 @@ static bool initialize(struct vf_instance *vf)
 
 static int vf_open(vf_instance_t *vf)
 {
+    struct vf_priv_s *p = vf->priv;
+
     vf->reconfig = reconfig;
     vf->filter_ext = filter_ext;
+    vf->filter_out = filter_out;
     vf->query_format = query_format;
     vf->uninit = uninit;
     vf->control = control;
 
-    struct vf_priv_s *p = vf->priv;
-    if (!vf->hwdec)
-        return false;
-    hwdec_request_api(vf->hwdec, "vaapi");
-    p->va = vf->hwdec->hwctx ? vf->hwdec->hwctx->vaapi_ctx : NULL;
-    if (!p->va || !p->va->display)
-        return false;
+    p->queue = mp_refqueue_alloc();
+
+    p->va = hwdec_devices_load(vf->hwdec_devs, HWDEC_VAAPI);
+    if (!p->va)
+        return 0;
     p->display = p->va->display;
     if (initialize(vf))
         return true;
diff --git a/video/filter/vf_vdpaupp.c b/video/filter/vf_vdpaupp.c
index 882b80d..92a40ec 100644
--- a/video/filter/vf_vdpaupp.c
+++ b/video/filter/vf_vdpaupp.c
@@ -1,20 +1,18 @@
 /*
  * This file is part of mpv.
  *
- * Parts based on fragments of vo_vdpau.c: Copyright (C) 2009 Uoti Urpala
- *
- * mpv is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * mpv is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * GNU Lesser General Public License for more details.
  *
- * You should have received a copy of the GNU General Public License along
- * with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #include <stdio.h>
@@ -33,6 +31,7 @@
 #include "video/vdpau.h"
 #include "video/vdpau_mixer.h"
 #include "vf.h"
+#include "refqueue.h"
 
 // Note: this filter does no actual filtering; it merely sets appropriate
 //       flags on vdpau images (mp_vdpau_mixer_frame) to do the appropriate
@@ -40,13 +39,7 @@
 
 struct vf_priv_s {
     struct mp_vdpau_ctx *ctx;
-
-    // This is needed to supply past/future fields and to calculate the
-    // interpolated timestamp.
-    struct mp_image *buffered[3];
-    int num_buffered;
-
-    int prev_pos;           // last field that was output
+    struct mp_refqueue *queue;
 
     int def_deintmode;
     int deint_enabled;
@@ -54,130 +47,92 @@ struct vf_priv_s {
     struct mp_vdpau_mixer_opts opts;
 };
 
-static void forget_frames(struct vf_instance *vf)
+static int filter_ext(struct vf_instance *vf, struct mp_image *mpi)
 {
     struct vf_priv_s *p = vf->priv;
-    for (int n = 0; n < p->num_buffered; n++)
-        talloc_free(p->buffered[n]);
-    p->num_buffered = 0;
-    p->prev_pos = 0;
-}
 
-#define FIELD_VALID(p, f) ((f) >= 0 && (f) < (p)->num_buffered * 2)
+    if (p->opts.deint >= 2) {
+        mp_refqueue_set_refs(p->queue, 1, 1); // 2 past fields, 1 future field
+    } else {
+        mp_refqueue_set_refs(p->queue, 0, 0);
+    }
+    mp_refqueue_set_mode(p->queue,
+        (p->deint_enabled ? MP_MODE_DEINT : 0) |
+        (p->interlaced_only ? MP_MODE_INTERLACED_ONLY : 0) |
+        (p->opts.deint >= 2 ? MP_MODE_OUTPUT_FIELDS : 0));
+
+    if (mpi) {
+        struct mp_image *new = mp_vdpau_upload_video_surface(p->ctx, mpi);
+        talloc_free(mpi);
+        if (!new)
+            return -1;
+        mpi = new;
+
+        if (mp_vdpau_mixed_frame_get(mpi)) {
+            MP_ERR(vf, "Can't apply vdpaupp filter multiple times.\n");
+            vf_add_output_frame(vf, mpi);
+            return -1;
+        }
+    }
+
+    mp_refqueue_add_input(p->queue, mpi);
+    return 0;
+}
 
 static VdpVideoSurface ref_field(struct vf_priv_s *p,
                                  struct mp_vdpau_mixer_frame *frame, int pos)
 {
-    if (!FIELD_VALID(p, pos))
-        return VDP_INVALID_HANDLE;
-    struct mp_image *mpi = mp_image_new_ref(p->buffered[pos / 2]);
+    struct mp_image *mpi = mp_image_new_ref(mp_refqueue_get_field(p->queue, pos));
     if (!mpi)
         return VDP_INVALID_HANDLE;
     talloc_steal(frame, mpi);
     return (uintptr_t)mpi->planes[3];
 }
 
-// pos==0 means last field of latest frame, 1 earlier field of latest frame,
-// 2 last field of previous frame and so on
-static bool output_field(struct vf_instance *vf, int pos, bool deint)
+static int filter_out(struct vf_instance *vf)
 {
     struct vf_priv_s *p = vf->priv;
 
-    if (!FIELD_VALID(p, pos))
-        return false;
+    if (!mp_refqueue_has_output(p->queue))
+        return 0;
 
-    struct mp_image *mpi = mp_vdpau_mixed_frame_create(p->buffered[pos / 2]);
+    struct mp_image *mpi =
+        mp_vdpau_mixed_frame_create(mp_refqueue_get_field(p->queue, 0));
     if (!mpi)
-        return false; // skip output on OOM
+        return -1; // OOM
     struct mp_vdpau_mixer_frame *frame = mp_vdpau_mixed_frame_get(mpi);
 
-    frame->field = VDP_VIDEO_MIXER_PICTURE_STRUCTURE_FRAME;
-    if (p->opts.deint && deint) {
-        int top_field_first = !!(mpi->fields & MP_IMGFIELD_TOP_FIRST);
-        frame->field = top_field_first ^ (pos & 1) ?
-            VDP_VIDEO_MIXER_PICTURE_STRUCTURE_BOTTOM_FIELD:
-            VDP_VIDEO_MIXER_PICTURE_STRUCTURE_TOP_FIELD;
+    if (!mp_refqueue_should_deint(p->queue)) {
+        frame->field = VDP_VIDEO_MIXER_PICTURE_STRUCTURE_FRAME;
+    } else if (mp_refqueue_is_top_field(p->queue)) {
+        frame->field = VDP_VIDEO_MIXER_PICTURE_STRUCTURE_TOP_FIELD;
+    } else {
+        frame->field = VDP_VIDEO_MIXER_PICTURE_STRUCTURE_BOTTOM_FIELD;
     }
 
-    frame->future[0] = ref_field(p, frame, pos - 1);
-    frame->current = ref_field(p, frame, pos);
-    frame->past[0] = ref_field(p, frame, pos + 1);
-    frame->past[1] = ref_field(p, frame, pos + 2);
+    frame->future[0] = ref_field(p, frame, 1);
+    frame->current = ref_field(p, frame, 0);
+    frame->past[0] = ref_field(p, frame, -1);
+    frame->past[1] = ref_field(p, frame, -2);
 
     frame->opts = p->opts;
 
     mpi->planes[3] = (void *)(uintptr_t)frame->current;
 
-    // Interpolate timestamps of extra fields (these always have even indexes)
-    int idx = pos / 2;
-    if (idx > 0 && !(pos & 1) && p->opts.deint >= 2 && deint) {
-        double pts1 = p->buffered[idx - 1]->pts;
-        double pts2 = p->buffered[idx]->pts;
-        double diff = pts1 - pts2;
-        mpi->pts = diff > 0 && diff < 0.5 ? (pts1 + pts2) / 2 : pts2;
-    }
+    mp_refqueue_next_field(p->queue);
 
     vf_add_output_frame(vf, mpi);
-    return true;
-}
-
-static int filter_ext(struct vf_instance *vf, struct mp_image *mpi)
-{
-    struct vf_priv_s *p = vf->priv;
-    int maxbuffer = p->opts.deint >= 2 ? 3 : 2;
-    bool eof = !mpi;
-
-    if (mpi) {
-        struct mp_image *new = mp_vdpau_upload_video_surface(p->ctx, mpi);
-        talloc_free(mpi);
-        if (!new)
-            return -1;
-        mpi = new;
-
-        if (mp_vdpau_mixed_frame_get(mpi)) {
-            MP_ERR(vf, "Can't apply vdpaupp filter multiple times.\n");
-            vf_add_output_frame(vf, mpi);
-            return -1;
-        }
-
-        while (p->num_buffered >= maxbuffer) {
-            talloc_free(p->buffered[p->num_buffered - 1]);
-            p->num_buffered--;
-        }
-        for (int n = p->num_buffered; n > 0; n--)
-            p->buffered[n] = p->buffered[n - 1];
-        p->buffered[0] = mpi;
-        p->num_buffered++;
-        p->prev_pos += 2;
-    }
-
-    bool deint = (mpi && (mpi->fields & MP_IMGFIELD_INTERLACED)) || !p->interlaced_only;
-
-    while (1) {
-        int current = p->prev_pos - 1;
-        if (!FIELD_VALID(p, current))
-            break;
-        // No field-splitting deinterlace -> only output first field (odd index)
-        if ((current & 1) || (deint && p->opts.deint >= 2)) {
-            // Wait for enough future frames being buffered.
-            // (Past frames are always around if available at all.)
-            if (!eof && !FIELD_VALID(p, current - 1))
-                break;
-            if (!output_field(vf, current, deint))
-                break;
-        }
-        p->prev_pos = current;
-    }
-
     return 0;
 }
 
 static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
                     struct mp_image_params *out)
 {
-    forget_frames(vf);
+    struct vf_priv_s *p = vf->priv;
+    mp_refqueue_flush(p->queue);
     *out = *in;
     out->imgfmt = IMGFMT_VDPAU;
+    out->hw_subfmt = 0;
     return 0;
 }
 
@@ -194,7 +149,7 @@ static int control(vf_instance_t *vf, int request, void *data)
 
     switch (request) {
     case VFCTRL_SEEK_RESET:
-        forget_frames(vf);
+        mp_refqueue_flush(p->queue);
         return CONTROL_OK;
     case VFCTRL_GET_DEINTERLACE:
         *(int *)data = !!p->deint_enabled;
@@ -209,7 +164,9 @@ static int control(vf_instance_t *vf, int request, void *data)
 
 static void uninit(struct vf_instance *vf)
 {
-    forget_frames(vf);
+    struct vf_priv_s *p = vf->priv;
+
+    mp_refqueue_free(p->queue);
 }
 
 static int vf_open(vf_instance_t *vf)
@@ -218,15 +175,14 @@ static int vf_open(vf_instance_t *vf)
 
     vf->reconfig = reconfig;
     vf->filter_ext = filter_ext;
-    vf->filter = NULL;
+    vf->filter_out = filter_out;
     vf->query_format = query_format;
     vf->control = control;
     vf->uninit = uninit;
 
-    if (!vf->hwdec)
-        return 0;
-    hwdec_request_api(vf->hwdec, "vdpau");
-    p->ctx = vf->hwdec->hwctx ? vf->hwdec->hwctx->vdpau_ctx : NULL;
+    p->queue = mp_refqueue_alloc();
+
+    p->ctx = hwdec_devices_load(vf->hwdec_devs, HWDEC_VDPAU);
     if (!p->ctx)
         return 0;
 
diff --git a/video/filter/vf_vdpaurb.c b/video/filter/vf_vdpaurb.c
index 62f7f34..2e6da79 100644
--- a/video/filter/vf_vdpaurb.c
+++ b/video/filter/vf_vdpaurb.c
@@ -35,10 +35,7 @@ struct vf_priv_s {
 
 static int filter_ext(struct vf_instance *vf, struct mp_image *mpi)
 {
-    VdpStatus vdp_st;
     struct vf_priv_s *p = vf->priv;
-    struct mp_vdpau_ctx *ctx = p->ctx;
-    struct vdp_functions *vdp = &ctx->vdp;
 
     if (!mpi) {
         return 0;
@@ -56,21 +53,14 @@ static int filter_ext(struct vf_instance *vf, struct mp_image *mpi)
         return -1;
     }
 
-    struct mp_image *out = vf_alloc_out_image(vf);
-    if (!out) {
+    struct mp_hwdec_ctx *hwctx = &p->ctx->hwctx;
+
+    struct mp_image *out = hwctx->download_image(hwctx, mpi, vf->out_pool);
+    if (!out || out->imgfmt != IMGFMT_NV12) {
         mp_image_unrefp(&mpi);
+        mp_image_unrefp(&out);
         return -1;
     }
-    mp_image_copy_attributes(out, mpi);
-
-    VdpVideoSurface surface = (uintptr_t)mpi->planes[3];
-    assert(surface > 0);
-
-    vdp_st = vdp->video_surface_get_bits_y_cb_cr(surface,
-                                                 VDP_YCBCR_FORMAT_NV12,
-                                                 (void * const *)out->planes,
-                                                 out->stride);
-    CHECK_VDP_WARNING(vf, "Error when calling vdp_output_surface_get_bits_y_cb_cr");
 
     vf_add_output_frame(vf, out);
     mp_image_unrefp(&mpi);
@@ -83,6 +73,7 @@ static int reconfig(struct vf_instance *vf, struct mp_image_params *in,
     *out = *in;
     if (in->imgfmt == IMGFMT_VDPAU) {
         out->imgfmt = IMGFMT_NV12;
+        out->hw_subfmt = 0;
     }
     return 0;
 }
@@ -101,14 +92,9 @@ static int vf_open(vf_instance_t *vf)
     vf->reconfig = reconfig;
     vf->query_format = query_format;
 
-    if (!vf->hwdec) {
+    p->ctx = hwdec_devices_load(vf->hwdec_devs, HWDEC_VDPAU);
+    if (!p->ctx)
         return 0;
-    }
-    hwdec_request_api(vf->hwdec, "vdpau");
-    p->ctx = vf->hwdec->hwctx ? vf->hwdec->hwctx->vdpau_ctx : NULL;
-    if (!p->ctx) {
-        return 0;
-    }
 
     return 1;
 }
diff --git a/video/fmt-conversion.c b/video/fmt-conversion.c
index 5334206..1fca8bf 100644
--- a/video/fmt-conversion.c
+++ b/video/fmt-conversion.c
@@ -109,13 +109,17 @@ static const struct {
 #endif
     {IMGFMT_VAAPI, AV_PIX_FMT_VAAPI_VLD},
     {IMGFMT_DXVA2, AV_PIX_FMT_DXVA2_VLD},
-#if HAVE_D3D11VA_HWACCEL
+#if HAVE_D3D_HWACCEL
     {IMGFMT_D3D11VA, AV_PIX_FMT_D3D11VA_VLD},
 #endif
 #if HAVE_AV_PIX_FMT_MMAL
     {IMGFMT_MMAL, AV_PIX_FMT_MMAL},
 #endif
 
+#ifdef AV_PIX_FMT_P010
+    {IMGFMT_P010, AV_PIX_FMT_P010},
+#endif
+
     {0, AV_PIX_FMT_NONE}
 };
 
diff --git a/video/hwdec.c b/video/hwdec.c
new file mode 100644
index 0000000..6db8d57
--- /dev/null
+++ b/video/hwdec.c
@@ -0,0 +1,90 @@
+#include <pthread.h>
+#include <assert.h>
+
+#include "hwdec.h"
+
+struct mp_hwdec_devices {
+    pthread_mutex_t lock;
+
+    struct mp_hwdec_ctx *hwctx;
+
+    void (*load_api)(void *ctx, enum hwdec_type type);
+    void *load_api_ctx;
+};
+
+struct mp_hwdec_devices *hwdec_devices_create(void)
+{
+    struct mp_hwdec_devices *devs = talloc_zero(NULL, struct mp_hwdec_devices);
+    pthread_mutex_init(&devs->lock, NULL);
+    return devs;
+}
+
+void hwdec_devices_destroy(struct mp_hwdec_devices *devs)
+{
+    if (!devs)
+        return;
+    assert(!devs->hwctx); // must have been hwdec_devices_remove()ed
+    assert(!devs->load_api); // must have been unset
+    pthread_mutex_destroy(&devs->lock);
+    talloc_free(devs);
+}
+
+struct mp_hwdec_ctx *hwdec_devices_get(struct mp_hwdec_devices *devs,
+                                       enum hwdec_type type)
+{
+    struct mp_hwdec_ctx *res = NULL;
+    pthread_mutex_lock(&devs->lock);
+    if (devs->hwctx && devs->hwctx->type == type)
+        res = devs->hwctx;
+    pthread_mutex_unlock(&devs->lock);
+    return res;
+}
+
+struct mp_hwdec_ctx *hwdec_devices_get_first(struct mp_hwdec_devices *devs)
+{
+    pthread_mutex_lock(&devs->lock);
+    struct mp_hwdec_ctx *res = devs->hwctx;
+    pthread_mutex_unlock(&devs->lock);
+    return res;
+}
+
+void hwdec_devices_add(struct mp_hwdec_devices *devs, struct mp_hwdec_ctx *ctx)
+{
+    pthread_mutex_lock(&devs->lock);
+    // We support only 1 device; ignore the rest.
+    if (!devs->hwctx)
+        devs->hwctx = ctx;
+    pthread_mutex_unlock(&devs->lock);
+}
+
+void hwdec_devices_remove(struct mp_hwdec_devices *devs, struct mp_hwdec_ctx *ctx)
+{
+    pthread_mutex_lock(&devs->lock);
+    if (devs->hwctx == ctx)
+        devs->hwctx = NULL;
+    pthread_mutex_unlock(&devs->lock);
+}
+
+void hwdec_devices_set_loader(struct mp_hwdec_devices *devs,
+    void (*load_api)(void *ctx, enum hwdec_type type), void *load_api_ctx)
+{
+    devs->load_api = load_api;
+    devs->load_api_ctx = load_api_ctx;
+}
+
+// Cause the VO to lazily load the requested device, blocking until this is
+// done (even if the device turns out not to be available).
+void hwdec_devices_request(struct mp_hwdec_devices *devs, enum hwdec_type type)
+{
+    if (devs->load_api && !hwdec_devices_get_first(devs))
+        devs->load_api(devs->load_api_ctx, type);
+}
+
+void *hwdec_devices_load(struct mp_hwdec_devices *devs, enum hwdec_type type)
+{
+    if (!devs)
+        return NULL;
+    hwdec_devices_request(devs, type);
+    struct mp_hwdec_ctx *hwctx = hwdec_devices_get(devs, type);
+    return hwctx ? hwctx->ctx : NULL;
+}
diff --git a/video/hwdec.h b/video/hwdec.h
index 898b035..5d563c9 100644
--- a/video/hwdec.h
+++ b/video/hwdec.h
@@ -7,32 +7,38 @@ struct mp_image_pool;
 
 // keep in sync with --hwdec option (see mp_hwdec_names)
 enum hwdec_type {
-    HWDEC_AUTO = -1,
     HWDEC_NONE = 0,
-    HWDEC_VDPAU = 1,
-    HWDEC_VIDEOTOOLBOX = 3,
-    HWDEC_VAAPI = 4,
-    HWDEC_VAAPI_COPY = 5,
-    HWDEC_DXVA2 = 6,
-    HWDEC_DXVA2_COPY = 7,
-    HWDEC_D3D11VA_COPY = 8,
-    HWDEC_RPI = 9,
-    HWDEC_MEDIACODEC = 10,
+    HWDEC_AUTO,
+    HWDEC_AUTO_COPY,
+    HWDEC_VDPAU,
+    HWDEC_VIDEOTOOLBOX,
+    HWDEC_VAAPI,
+    HWDEC_VAAPI_COPY,
+    HWDEC_DXVA2,
+    HWDEC_DXVA2_COPY,
+    HWDEC_D3D11VA,
+    HWDEC_D3D11VA_COPY,
+    HWDEC_RPI,
+    HWDEC_MEDIACODEC,
 };
 
+#define HWDEC_IS_AUTO(x) ((x) == HWDEC_AUTO || (x) == HWDEC_AUTO_COPY)
+
 // hwdec_type names (options.c)
 extern const struct m_opt_choice_alternatives mp_hwdec_names[];
 
 struct mp_hwdec_ctx {
-    enum hwdec_type type;
-
-    void *priv; // for free use by hwdec implementation
+    enum hwdec_type type; // (never HWDEC_NONE or HWDEC_IS_AUTO)
+    const char *driver_name; // NULL if unknown/not loaded
 
-    // API-specific, not needed by all backends.
-    struct mp_vdpau_ctx *vdpau_ctx;
-    struct mp_vaapi_ctx *vaapi_ctx;
-    struct mp_d3d_ctx *d3d_ctx;
-    uint32_t (*get_vt_fmt)(struct mp_hwdec_ctx *ctx);
+    // This is never NULL. Its meaning depends on the .type field:
+    //  HWDEC_VDPAU:            struct mp_vdpau_ctx*
+    //  HWDEC_VIDEOTOOLBOX:     struct mp_vt_ctx*
+    //  HWDEC_VAAPI:            struct mp_vaapi_ctx*
+    //  HWDEC_D3D11VA:          ID3D11Device*
+    //  HWDEC_DXVA2:            IDirect3DDevice9*
+    //  HWDEC_DXVA2_COPY:       IDirect3DDevice9*
+    void *ctx;
 
     // Optional.
     // Allocates a software image from the pool, downloads the hw image from
@@ -44,24 +50,50 @@ struct mp_hwdec_ctx {
                                        struct mp_image_pool *swpool);
 };
 
-// Used to communicate hardware decoder API handles from VO to video decoder.
-// The VO can set the context pointer for supported APIs.
-struct mp_hwdec_info {
-    // (Since currently only 1 hwdec API is loaded at a time, this pointer
-    // simply maps to the loaded one.)
-    struct mp_hwdec_ctx *hwctx;
-
-    // Can be used to lazily load a requested API.
-    // api_name is e.g. "vdpau" (like the fields above, without "_ctx")
-    // Can be NULL, is idempotent, caller checks hwctx fields for success/access.
-    // Due to threading, the callback is the only code that is allowed to
-    // change fields in this struct after initialization.
-    void (*load_api)(struct mp_hwdec_info *info, const char *api_name);
-    void *load_api_ctx;
+struct mp_vt_ctx {
+    void *priv;
+    uint32_t (*get_vt_fmt)(struct mp_vt_ctx *ctx);
 };
 
-// Trivial helper to call info->load_api().
-// Implemented in vd_lavc.c.
-void hwdec_request_api(struct mp_hwdec_info *info, const char *api_name);
+// Used to communicate hardware decoder device handles from VO to video decoder.
+struct mp_hwdec_devices;
+
+struct mp_hwdec_devices *hwdec_devices_create(void);
+void hwdec_devices_destroy(struct mp_hwdec_devices *devs);
+
+// Return the device context for the given API type. Returns NULL if none
+// available. Logically, the returned pointer remains valid until VO
+// uninitialization is started (all users of it must be uninitialized before).
+// hwdec_devices_request() may be used before this to lazily load devices.
+struct mp_hwdec_ctx *hwdec_devices_get(struct mp_hwdec_devices *devs,
+                                       enum hwdec_type type);
+
+// For code which still strictly assumes there is 1 (or none) device.
+struct mp_hwdec_ctx *hwdec_devices_get_first(struct mp_hwdec_devices *devs);
+
+// Add this to the list of internal devices. Adding the same pointer twice must
+// be avoided.
+void hwdec_devices_add(struct mp_hwdec_devices *devs, struct mp_hwdec_ctx *ctx);
+
+// Remove this from the list of internal devices. Idempotent/ignores entries
+// not added yet.
+void hwdec_devices_remove(struct mp_hwdec_devices *devs, struct mp_hwdec_ctx *ctx);
+
+// Can be used to enable lazy loading of an API with hwdec_devices_request().
+// If used at all, this must be set/unset during initialization/uninitialization,
+// as concurrent use with hwdec_devices_request() is a race condition.
+void hwdec_devices_set_loader(struct mp_hwdec_devices *devs,
+    void (*load_api)(void *ctx, enum hwdec_type type), void *load_api_ctx);
+
+// Cause the VO to lazily load the requested device, and block until this is
+// done (even if the device turns out not to be available).
+void hwdec_devices_request(struct mp_hwdec_devices *devs, enum hwdec_type type);
+
+// Convenience function:
+// - return NULL if devs==NULL
+// - call hwdec_devices_request(devs, type)
+// - call hwdec_devices_get(devs, type)
+// - then return the mp_hwdec_ctx.ctx field
+void *hwdec_devices_load(struct mp_hwdec_devices *devs, enum hwdec_type type);
 
 #endif
diff --git a/video/image_writer.c b/video/image_writer.c
index 6c1c994..5ba89c8 100644
--- a/video/image_writer.c
+++ b/video/image_writer.c
@@ -136,9 +136,21 @@ static bool write_lavc(struct image_writer_ctx *ctx, mp_image_t *image, FILE *fp
         pic->color_primaries = mp_csp_prim_to_avcol_pri(image->params.primaries);
         pic->color_trc = mp_csp_trc_to_avcol_trc(image->params.gamma);
     }
+
+#if HAVE_AVCODEC_NEW_CODEC_API
+    int ret = avcodec_send_frame(avctx, pic);
+    if (ret < 0)
+        goto error_exit;
+    avcodec_send_frame(avctx, NULL); // send EOF
+    ret = avcodec_receive_packet(avctx, &pkt);
+    if (ret < 0)
+        goto error_exit;
+    got_output = 1;
+#else
     int ret = avcodec_encode_video2(avctx, &pkt, pic, &got_output);
     if (ret < 0)
         goto error_exit;
+#endif
 
     fwrite(pkt.data, pkt.size, 1, fp);
 
diff --git a/video/img_format.c b/video/img_format.c
index fe2ca14..24545a8 100644
--- a/video/img_format.c
+++ b/video/img_format.c
@@ -36,6 +36,8 @@ struct mp_imgfmt_entry {
 static const struct mp_imgfmt_entry mp_imgfmt_list[] = {
     // not in ffmpeg
     {"vdpau_output",    IMGFMT_VDPAU_OUTPUT},
+    {"d3d11_nv12",      IMGFMT_D3D11NV12},
+    {"d3d11_rgb",       IMGFMT_D3D11RGB},
     // FFmpeg names have an annoying "_vld" suffix
     {"videotoolbox",    IMGFMT_VIDEOTOOLBOX},
     {"vaapi",           IMGFMT_VAAPI},
@@ -120,12 +122,20 @@ static struct mp_imgfmt_desc mp_only_imgfmt_desc(int mpfmt)
 {
     switch (mpfmt) {
     case IMGFMT_VDPAU_OUTPUT:
+    case IMGFMT_D3D11RGB:
         return (struct mp_imgfmt_desc) {
             .id = mpfmt,
             .avformat = AV_PIX_FMT_NONE,
             .flags = MP_IMGFLAG_BE | MP_IMGFLAG_LE | MP_IMGFLAG_RGB |
                      MP_IMGFLAG_HWACCEL,
         };
+    case IMGFMT_D3D11NV12:
+        return (struct mp_imgfmt_desc) {
+            .id = mpfmt,
+            .avformat = AV_PIX_FMT_NONE,
+            .flags = MP_IMGFLAG_BE | MP_IMGFLAG_LE | MP_IMGFLAG_YUV |
+                     MP_IMGFLAG_HWACCEL,
+        };
     }
     return (struct mp_imgfmt_desc) {0};
 }
diff --git a/video/img_format.h b/video/img_format.h
index 605dc92..b6f5830 100644
--- a/video/img_format.h
+++ b/video/img_format.h
@@ -157,6 +157,9 @@ enum mp_imgfmt {
     IMGFMT_NV12,
     IMGFMT_NV21,
 
+    // Like IMGFMT_NV12, but with 16 bits per component
+    IMGFMT_P010,
+
     // RGB/BGR Formats
 
     // Byte accessed (low address to high address)
@@ -195,7 +198,17 @@ enum mp_imgfmt {
     IMGFMT_VDPAU,           // VdpVideoSurface
     IMGFMT_VDPAU_OUTPUT,    // VdpOutputSurface
     IMGFMT_VAAPI,
-    IMGFMT_D3D11VA,         // ID3D11VideoDecoderOutputView (NV12/P010/P016)
+    // NV12/P010/P016
+    // plane 1: ID3D11Texture2D
+    // plane 2: slice index cast to a pointer
+    // plane 3: ID3D11VideoDecoderOutputView (can be absent in filters/VO)
+    IMGFMT_D3D11VA,
+    // Like IMGFMT_D3D11VA, but format is restricted to NV12.
+    IMGFMT_D3D11NV12,
+    // Like IMGFMT_D3D11VA, but format is restricted to a certain RGB format.
+    // Also, it must have a share handle, have been flushed, and not be a
+    // texture array slice.
+    IMGFMT_D3D11RGB,
     IMGFMT_DXVA2,           // IDirect3DSurface9 (NV12/P010/P016)
     IMGFMT_MMAL,            // MMAL_BUFFER_HEADER_T
     IMGFMT_VIDEOTOOLBOX,    // CVPixelBufferRef
diff --git a/video/mp_image.c b/video/mp_image.c
index 565de18..d5b9748 100644
--- a/video/mp_image.c
+++ b/video/mp_image.c
@@ -81,12 +81,13 @@ static bool mp_image_alloc_planes(struct mp_image *mpi)
 
 void mp_image_setfmt(struct mp_image *mpi, int out_fmt)
 {
+    struct mp_image_params params = mpi->params;
     struct mp_imgfmt_desc fmt = mp_imgfmt_get_desc(out_fmt);
-    mpi->params.imgfmt = fmt.id;
+    params.imgfmt = fmt.id;
     mpi->fmt = fmt;
     mpi->imgfmt = fmt.id;
     mpi->num_planes = fmt.num_planes;
-    mp_image_set_size(mpi, mpi->w, mpi->h);
+    mpi->params = params;
 }
 
 static void mp_image_destructor(void *ptr)
@@ -94,6 +95,7 @@ static void mp_image_destructor(void *ptr)
     mp_image_t *mpi = ptr;
     for (int p = 0; p < MP_MAX_PLANES; p++)
         av_buffer_unref(&mpi->bufs[p]);
+    av_buffer_unref(&mpi->hwctx);
 }
 
 int mp_chroma_div_up(int size, int shift)
@@ -119,7 +121,6 @@ void mp_image_set_size(struct mp_image *mpi, int w, int h)
     assert(w >= 0 && h >= 0);
     mpi->w = mpi->params.w = w;
     mpi->h = mpi->params.h = h;
-    mpi->params.p_w = mpi->params.p_h = 1;
 }
 
 void mp_image_set_params(struct mp_image *image,
@@ -163,17 +164,12 @@ void mp_image_steal_data(struct mp_image *dst, struct mp_image *src)
     assert(dst->imgfmt == src->imgfmt && dst->w == src->w && dst->h == src->h);
     assert(dst->bufs[0] && src->bufs[0]);
 
-    for (int p = 0; p < MP_MAX_PLANES; p++) {
-        dst->planes[p] = src->planes[p];
-        dst->stride[p] = src->stride[p];
-    }
-    mp_image_copy_attributes(dst, src);
+    mp_image_destructor(dst); // unref old
+    talloc_free_children(dst);
 
-    for (int p = 0; p < MP_MAX_PLANES; p++) {
-        av_buffer_unref(&dst->bufs[p]);
-        dst->bufs[p] = src->bufs[p];
-        src->bufs[p] = NULL;
-    }
+    *dst = *src;
+
+    *src = (struct mp_image){0};
     talloc_free(src);
 }
 
@@ -199,6 +195,11 @@ struct mp_image *mp_image_new_ref(struct mp_image *img)
                 fail = true;
         }
     }
+    if (new->hwctx) {
+        new->hwctx = av_buffer_ref(new->hwctx);
+        if (!new->hwctx)
+            fail = true;
+    }
 
     if (!fail)
         return new;
@@ -229,9 +230,10 @@ struct mp_image *mp_image_new_dummy_ref(struct mp_image *img)
 {
     struct mp_image *new = talloc_ptrtype(NULL, new);
     talloc_set_destructor(new, mp_image_destructor);
-    *new = *img;
+    *new = img ? *img : (struct mp_image){0};
     for (int p = 0; p < MP_MAX_PLANES; p++)
         new->bufs[p] = NULL;
+    new->hwctx = NULL;
     return new;
 }
 
@@ -539,7 +541,7 @@ bool mp_image_params_valid(const struct mp_image_params *p)
     if (p->w <= 0 || p->h <= 0 || (p->w + 128LL) * (p->h + 128LL) >= INT_MAX / 8)
         return false;
 
-    if (p->p_w <= 0 || p->p_h <= 0)
+    if (p->p_w < 0 || p->p_h < 0)
         return false;
 
     if (p->rotate < 0 || p->rotate >= 360)
@@ -566,6 +568,7 @@ bool mp_image_params_equal(const struct mp_image_params *p1,
            p1->colorlevels == p2->colorlevels &&
            p1->primaries == p2->primaries &&
            p1->gamma == p2->gamma &&
+           p1->peak == p2->peak &&
            p1->chroma_location == p2->chroma_location &&
            p1->rotate == p2->rotate &&
            p1->stereo_in == p2->stereo_in &&
@@ -660,16 +663,25 @@ void mp_image_params_guess_csp(struct mp_image_params *params)
         params->primaries = MP_CSP_PRIM_AUTO;
         params->gamma = MP_CSP_TRC_AUTO;
     }
+
+    // Guess the reference peak (independent of the colorspace)
+    if (params->gamma == MP_CSP_TRC_SMPTE_ST2084) {
+        if (!params->peak)
+            params->peak = 10000; // As per the spec
+    }
 }
 
 // Copy properties and data of the AVFrame into the mp_image, without taking
 // care of memory management issues.
-void mp_image_copy_fields_from_av_frame(struct mp_image *dst,
-                                        struct AVFrame *src)
+static void mp_image_copy_fields_from_av_frame(struct mp_image *dst,
+                                               struct AVFrame *src)
 {
     mp_image_setfmt(dst, pixfmt2imgfmt(src->format));
     mp_image_set_size(dst, src->width, src->height);
 
+    dst->params.p_w = src->sample_aspect_ratio.num;
+    dst->params.p_h = src->sample_aspect_ratio.den;
+
     for (int i = 0; i < 4; i++) {
         dst->planes[i] = src->data[i];
         dst->stride[i] = src->linesize[i];
@@ -688,13 +700,16 @@ void mp_image_copy_fields_from_av_frame(struct mp_image *dst,
 
 // Copy properties and data of the mp_image into the AVFrame, without taking
 // care of memory management issues.
-void mp_image_copy_fields_to_av_frame(struct AVFrame *dst,
-                                      struct mp_image *src)
+static void mp_image_copy_fields_to_av_frame(struct AVFrame *dst,
+                                             struct mp_image *src)
 {
     dst->format = imgfmt2pixfmt(src->imgfmt);
     dst->width = src->w;
     dst->height = src->h;
 
+    dst->sample_aspect_ratio.num = src->params.p_w;
+    dst->sample_aspect_ratio.den = src->params.p_h;
+
     for (int i = 0; i < 4; i++) {
         dst->data[i] = src->planes[i];
         dst->linesize[i] = src->stride[i];
@@ -720,34 +735,41 @@ struct mp_image *mp_image_from_av_frame(struct AVFrame *av_frame)
     mp_image_copy_fields_from_av_frame(&t, av_frame);
     for (int p = 0; p < MP_MAX_PLANES; p++)
         t.bufs[p] = av_frame->buf[p];
+#if HAVE_AVUTIL_HAS_HWCONTEXT
+    t.hwctx = av_frame->hw_frames_ctx;
+#endif
     return mp_image_new_ref(&t);
 }
 
 // Convert the mp_image reference to a AVFrame reference.
-// Warning: img is unreferenced (i.e. free'd). This is asymmetric to
-//          mp_image_from_av_frame(). It was done as some sort of optimization,
-//          but now these semantics are pointless.
-// On failure, img is only unreffed.
-struct AVFrame *mp_image_to_av_frame_and_unref(struct mp_image *img)
+struct AVFrame *mp_image_to_av_frame(struct mp_image *img)
 {
-    struct mp_image *new_ref = mp_image_new_ref(img); // ensure it's refcounted
-    talloc_free(img);
-    if (!new_ref)
-        return NULL;
+    struct mp_image *new_ref = mp_image_new_ref(img);
     AVFrame *frame = av_frame_alloc();
-    if (!frame) {
+    if (!frame || !new_ref) {
         talloc_free(new_ref);
+        av_frame_free(&frame);
         return NULL;
     }
     mp_image_copy_fields_to_av_frame(frame, new_ref);
-    for (int p = 0; p < MP_MAX_PLANES; p++) {
+    for (int p = 0; p < MP_MAX_PLANES; p++)
         frame->buf[p] = new_ref->bufs[p];
-        new_ref->bufs[p] = NULL;
-    }
+#if HAVE_AVUTIL_HAS_HWCONTEXT
+    frame->hw_frames_ctx = new_ref->hwctx;
+#endif
+    *new_ref = (struct mp_image){0};
     talloc_free(new_ref);
     return frame;
 }
 
+// Same as mp_image_to_av_frame(), but unref img. (It does so even on failure.)
+struct AVFrame *mp_image_to_av_frame_and_unref(struct mp_image *img)
+{
+    AVFrame *frame = mp_image_to_av_frame(img);
+    talloc_free(img);
+    return frame;
+}
+
 void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height,
                 int dstStride, int srcStride)
 {
diff --git a/video/mp_image.h b/video/mp_image.h
index c00c78a..18d2596 100644
--- a/video/mp_image.h
+++ b/video/mp_image.h
@@ -42,11 +42,12 @@ struct mp_image_params {
     uint64_t hw_subfmt;         // underlying format for some hwaccel pixfmts
                                 // (will use the HW API's format identifiers)
     int w, h;                   // image dimensions
-    int p_w, p_h;               // define pixel aspect ratio (never 0/0)
+    int p_w, p_h;               // define pixel aspect ratio (undefined: 0/0)
     enum mp_csp colorspace;
     enum mp_csp_levels colorlevels;
     enum mp_csp_prim primaries;
     enum mp_csp_trc gamma;
+    float peak; // 0 = auto/unknown
     enum mp_chroma_location chroma_location;
     // The image should be rotated clockwise (0-359 degrees).
     int rotate;
@@ -100,6 +101,8 @@ typedef struct mp_image {
     // All mp_* functions manage this automatically; do not mess with it.
     // (See also AVFrame.buf.)
     struct AVBufferRef *bufs[MP_MAX_PLANES];
+    // Points to AVHWFramesContext* (same as AVFrame.hw_frames_ctx)
+    struct AVBufferRef *hwctx;
 } mp_image_t;
 
 int mp_chroma_div_up(int size, int shift);
@@ -152,11 +155,8 @@ void mp_image_set_attributes(struct mp_image *image,
                              const struct mp_image_params *params);
 
 struct AVFrame;
-void mp_image_copy_fields_from_av_frame(struct mp_image *dst,
-                                        struct AVFrame *src);
-void mp_image_copy_fields_to_av_frame(struct AVFrame *dst,
-                                      struct mp_image *src);
 struct mp_image *mp_image_from_av_frame(struct AVFrame *av_frame);
+struct AVFrame *mp_image_to_av_frame(struct mp_image *img);
 struct AVFrame *mp_image_to_av_frame_and_unref(struct mp_image *img);
 
 void memcpy_pic(void *dst, const void *src, int bytesPerLine, int height,
diff --git a/video/out/bitmap_packer.c b/video/out/bitmap_packer.c
index 4896076..3f75a72 100644
--- a/video/out/bitmap_packer.c
+++ b/video/out/bitmap_packer.c
@@ -22,6 +22,7 @@
 #include <stdlib.h>
 #include <assert.h>
 #include <stdio.h>
+#include <limits.h>
 
 #include <libavutil/common.h>
 
@@ -46,10 +47,7 @@ void packer_reset(struct bitmap_packer *packer)
 void packer_get_bb(struct bitmap_packer *packer, struct pos out_bb[2])
 {
     out_bb[0] = (struct pos) {0};
-    out_bb[1] = (struct pos) {
-        FFMIN(packer->used_width + packer->padding, packer->w),
-        FFMIN(packer->used_height + packer->padding, packer->h),
-    };
+    out_bb[1] = (struct pos) {packer->used_width, packer->used_height};
 }
 
 #define HEIGHT_SORT_BITS 4
@@ -138,8 +136,12 @@ int packer_pack(struct bitmap_packer *packer)
     struct pos *in = packer->in;
     int xmax = 0, ymax = 0;
     for (int i = 0; i < packer->count; i++) {
-        if (in[i].x <= packer->padding || in[i].y <= packer->padding)
+        if (in[i].x <= 0 || in[i].y <= 0) {
             in[i] = (struct pos){0, 0};
+        } else {
+            in[i].x += packer->padding * 2;
+            in[i].y += packer->padding * 2;
+        }
         if (in[i].x < 0 || in [i].x > 65535 || in[i].y < 0 || in[i].y > 65535) {
             fprintf(stderr, "Invalid OSD / subtitle bitmap size\n");
             abort();
@@ -147,8 +149,6 @@ int packer_pack(struct bitmap_packer *packer)
         xmax = FFMAX(xmax, in[i].x);
         ymax = FFMAX(ymax, in[i].y);
     }
-    xmax = FFMAX(0, xmax - packer->padding);
-    ymax = FFMAX(0, ymax - packer->padding);
     if (xmax > packer->w)
         packer->w = 1 << (av_log2(xmax - 1) + 1);
     if (ymax > packer->h)
@@ -156,21 +156,27 @@ int packer_pack(struct bitmap_packer *packer)
     while (1) {
         int used_width = 0;
         int y = pack_rectangles(in, packer->result, packer->count,
-                                packer->w + packer->padding,
-                                packer->h + packer->padding,
+                                packer->w, packer->h,
                                 packer->scratch, &used_width);
         if (y >= 0) {
-            // No padding at edges
             packer->used_width = FFMIN(used_width, packer->w);
             packer->used_height = FFMIN(y, packer->h);
             assert(packer->w == 0 || IS_POWER_OF_2(packer->w));
             assert(packer->h == 0 || IS_POWER_OF_2(packer->h));
+            if (packer->padding) {
+                for (int i = 0; i < packer->count; i++) {
+                    packer->result[i].x += packer->padding;
+                    packer->result[i].y += packer->padding;
+                }
+            }
             return packer->w != w_orig || packer->h != h_orig;
         }
-        if (packer->w <= packer->h && packer->w != packer->w_max)
-            packer->w = FFMIN(packer->w * 2, packer->w_max);
-        else if (packer->h != packer->h_max)
-            packer->h = FFMIN(packer->h * 2, packer->h_max);
+        int w_max = packer->w_max > 0 ? packer->w_max : INT_MAX;
+        int h_max = packer->h_max > 0 ? packer->h_max : INT_MAX;
+        if (packer->w <= packer->h && packer->w != w_max)
+            packer->w = FFMIN(packer->w * 2, w_max);
+        else if (packer->h != h_max)
+            packer->h = FFMIN(packer->h * 2, h_max);
         else {
             packer->w = w_orig;
             packer->h = h_orig;
@@ -201,9 +207,8 @@ int packer_pack_from_subbitmaps(struct bitmap_packer *packer,
     if (b->format == SUBBITMAP_EMPTY)
         return 0;
     packer_set_size(packer, b->num_parts);
-    int a = packer->padding;
     for (int i = 0; i < b->num_parts; i++)
-        packer->in[i] = (struct pos){b->parts[i].w + a, b->parts[i].h + a};
+        packer->in[i] = (struct pos){b->parts[i].w, b->parts[i].h};
     return packer_pack(packer);
 }
 
diff --git a/video/out/bitmap_packer.h b/video/out/bitmap_packer.h
index b86c3ec..8fd2fce 100644
--- a/video/out/bitmap_packer.h
+++ b/video/out/bitmap_packer.h
@@ -23,7 +23,6 @@ struct bitmap_packer {
     int asize;
 };
 
-struct ass_image;
 struct sub_bitmaps;
 
 // Clear all internal state. Leave the following fields: w_max, h_max
diff --git a/video/out/cocoa/events_view.m b/video/out/cocoa/events_view.m
index 6fec712..4a0c4bf 100644
--- a/video/out/cocoa/events_view.m
+++ b/video/out/cocoa/events_view.m
@@ -358,8 +358,13 @@
 {
     NSPasteboard *pboard = [sender draggingPasteboard];
     if ([[pboard types] containsObject:NSURLPboardType]) {
-        NSURL *file_url = [NSURL URLFromPasteboard:pboard];
-        [self.adapter handleFilesArray:@[[file_url absoluteString]]];
+        NSArray *pbitems = [pboard readObjectsForClasses:@[[NSURL class]]
+                            options:@{}];
+        NSMutableArray* ar = [[[NSMutableArray alloc] init] autorelease];
+        for (NSURL* url in pbitems) {
+            [ar addObject:[url path]];
+        }
+        [self.adapter handleFilesArray:ar];
         return YES;
     } else if ([[pboard types] containsObject:NSFilenamesPboardType]) {
         NSArray *pbitems = [pboard propertyListForType:NSFilenamesPboardType];
diff --git a/video/out/cocoa/window.m b/video/out/cocoa/window.m
index 646281d..d89e296 100644
--- a/video/out/cocoa/window.m
+++ b/video/out/cocoa/window.m
@@ -56,6 +56,11 @@
     [self.adapter setNeedsResize];
 }
 
+- (void)windowDidChangeScreen:(NSNotification *)notification
+{
+    [self.adapter windowDidChangeScreen:notification];
+}
+
 - (void)windowDidChangeScreenProfile:(NSNotification *)notification
 {
     [self.adapter didChangeWindowedScreenProfile:[self screen]];
diff --git a/video/out/cocoa_common.m b/video/out/cocoa_common.m
index 30b832d..21e1246 100644
--- a/video/out/cocoa_common.m
+++ b/video/out/cocoa_common.m
@@ -49,6 +49,9 @@
 
 #include "common/msg.h"
 
+static CVReturn displayLinkCallback(CVDisplayLinkRef displayLink, const CVTimeStamp* now, 
+                                    const CVTimeStamp* outputTime, CVOptionFlags flagsIn, 
+                                    CVOptionFlags* flagsOut, void* displayLinkContext);
 static int vo_cocoa_fullscreen(struct vo *vo);
 static void cocoa_rm_fs_screen_profile_observer(struct vo_cocoa_state *s);
 static void cocoa_add_screen_reconfiguration_observer(struct vo *vo);
@@ -370,6 +373,7 @@ static void vo_cocoa_update_screens_pointers(struct vo *vo)
 static void vo_cocoa_update_screen_fps(struct vo *vo)
 {
     struct vo_cocoa_state *s = vo->cocoa;
+
     NSScreen *screen = vo->opts->fullscreen ? s->fs_screen : s->current_screen;
     NSDictionary* sinfo = [screen deviceDescription];
     NSNumber* sid = [sinfo objectForKey:@"NSScreenNumber"];
@@ -377,16 +381,24 @@ static void vo_cocoa_update_screen_fps(struct vo *vo)
 
     CVDisplayLinkRef link;
     CVDisplayLinkCreateWithCGDisplay(did, &link);
-    s->screen_fps = CVDisplayLinkGetActualOutputVideoRefreshPeriod(link);
+    CVDisplayLinkSetOutputCallback(link, &displayLinkCallback, NULL);
+    CVDisplayLinkStart(link);
+    CVDisplayLinkSetCurrentCGDisplay(link, did);
+
+    double display_period = CVDisplayLinkGetActualOutputVideoRefreshPeriod(link);
 
-    if (s->screen_fps == 0) {
+    if (display_period > 0) {
+        s->screen_fps = 1/display_period;
+    } else {
         // Fallback to using Nominal refresh rate from DisplayLink,
         // CVDisplayLinkGet *Actual* OutputVideoRefreshPeriod seems to
         // return 0 on some Apple devices. Use the nominal refresh period
         // instead.
         const CVTime t = CVDisplayLinkGetNominalOutputVideoRefreshPeriod(link);
-        if (!(t.flags & kCVTimeIsIndefinite))
+        if (!(t.flags & kCVTimeIsIndefinite)) {
             s->screen_fps = (t.timeScale / (double) t.timeValue);
+            MP_VERBOSE(vo, "Falling back to %f for display sync.\n", s->screen_fps);
+        }
     }
 
     CVDisplayLinkRelease(link);
@@ -394,6 +406,13 @@ static void vo_cocoa_update_screen_fps(struct vo *vo)
     flag_events(vo, VO_EVENT_WIN_STATE);
 }
 
+static CVReturn displayLinkCallback(CVDisplayLinkRef displayLink, const CVTimeStamp* now,
+                                    const CVTimeStamp* outputTime, CVOptionFlags flagsIn,
+                                    CVOptionFlags* flagsOut, void* displayLinkContext)
+{
+    return kCVReturnSuccess;
+}
+
 static void vo_cocoa_update_screen_info(struct vo *vo, struct mp_rect *out_rc)
 {
     struct vo_cocoa_state *s = vo->cocoa;
@@ -931,6 +950,11 @@ int vo_cocoa_control(struct vo *vo, int *events, int request, void *arg)
     [[EventsResponder sharedInstance] handleFilesArray:files];
 }
 
+- (void)windowDidChangeScreen:(NSNotification *)notification
+{
+    vo_cocoa_update_screen_info(self.vout, NULL);
+}
+
 - (void)didChangeWindowedScreenProfile:(NSScreen *)screen
 {
     flag_events(self.vout, VO_EVENT_ICC_PROFILE_CHANGED);
diff --git a/video/out/drm_common.c b/video/out/drm_common.c
index c105a14..a39db93 100644
--- a/video/out/drm_common.c
+++ b/video/out/drm_common.c
@@ -222,7 +222,7 @@ void kms_destroy(struct kms *kms)
 static void vt_switcher_sighandler(int sig)
 {
     unsigned char event = sig == RELEASE_SIGNAL ? EVT_RELEASE : EVT_ACQUIRE;
-    write(vt_switcher_pipe[1], &event, sizeof(event));
+    (void)write(vt_switcher_pipe[1], &event, sizeof(event));
 }
 
 static bool has_signal_installed(int signo)
@@ -312,7 +312,7 @@ void vt_switcher_release(struct vt_switcher *s,
 void vt_switcher_interrupt_poll(struct vt_switcher *s)
 {
     unsigned char event = EVT_INTERRUPT;
-    write(vt_switcher_pipe[1], &event, sizeof(event));
+    (void)write(vt_switcher_pipe[1], &event, sizeof(event));
 }
 
 void vt_switcher_destroy(struct vt_switcher *s)
diff --git a/video/out/opengl/angle_common.c b/video/out/opengl/angle_common.c
new file mode 100644
index 0000000..21cc924
--- /dev/null
+++ b/video/out/opengl/angle_common.c
@@ -0,0 +1,13 @@
+#include "angle_common.h"
+
+// Test if Direct3D11 can be used by us. Basically, this prevents trying to use
+// D3D11 on Win7, and then failing somewhere in the process.
+bool d3d11_check_decoding(ID3D11Device *dev)
+{
+    HRESULT hr;
+    // We assume that NV12 is always supported, if hw decoding is supported at
+    // all.
+    UINT supported = 0;
+    hr = ID3D11Device_CheckFormatSupport(dev, DXGI_FORMAT_NV12, &supported);
+    return !FAILED(hr) && (supported & D3D11_BIND_DECODER);
+}
diff --git a/video/out/opengl/angle_common.h b/video/out/opengl/angle_common.h
new file mode 100644
index 0000000..14ecd6a
--- /dev/null
+++ b/video/out/opengl/angle_common.h
@@ -0,0 +1,13 @@
+#ifndef MP_ANGLE_COMMON_H
+#define MP_ANGLE_COMMON_H
+
+#include <initguid.h>
+#include <assert.h>
+#include <windows.h>
+#include <d3d11.h>
+
+#include <stdbool.h>
+
+bool d3d11_check_decoding(ID3D11Device *dev);
+
+#endif
+\ No newline at end of file
diff --git a/video/out/opengl/angle_dynamic.c b/video/out/opengl/angle_dynamic.c
new file mode 100644
index 0000000..f4540c4
--- /dev/null
+++ b/video/out/opengl/angle_dynamic.c
@@ -0,0 +1,33 @@
+#include <pthread.h>
+#include <windows.h>
+
+#define ANGLE_NO_ALIASES
+#include "angle_dynamic.h"
+
+#include "common/common.h"
+
+#define ANGLE_DECL(NAME, VAR) \
+    VAR;
+ANGLE_FNS(ANGLE_DECL)
+
+static bool angle_loaded;
+static pthread_once_t angle_load_once = PTHREAD_ONCE_INIT;
+
+static void angle_do_load(void)
+{
+    // Note: we let this handle "leak", as the functions remain valid forever.
+    HANDLE angle_dll = LoadLibraryW(L"LIBEGL.DLL");
+    if (!angle_dll)
+        return;
+#define ANGLE_LOAD_ENTRY(NAME, VAR) \
+    MP_CONCAT(PFN_, NAME) = (void *)GetProcAddress(angle_dll, #NAME); \
+    if (!MP_CONCAT(PFN_, NAME)) return;
+    ANGLE_FNS(ANGLE_LOAD_ENTRY)
+    angle_loaded = true;
+}
+
+bool angle_load(void)
+{
+    pthread_once(&angle_load_once, angle_do_load);
+    return angle_loaded;
+}
diff --git a/video/out/opengl/angle_dynamic.h b/video/out/opengl/angle_dynamic.h
new file mode 100644
index 0000000..87ad85c
--- /dev/null
+++ b/video/out/opengl/angle_dynamic.h
@@ -0,0 +1,82 @@
+// Based on Khronos headers, thus MIT licensed.
+
+#ifndef MP_ANGLE_DYNAMIC_H
+#define MP_ANGLE_DYNAMIC_H
+
+#include <stdbool.h>
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+
+#define ANGLE_FNS(FN) \
+    FN(eglBindAPI, EGLBoolean (*EGLAPIENTRY PFN_eglBindAPI)(EGLenum)) \
+    FN(eglBindTexImage, EGLBoolean (*EGLAPIENTRY PFN_eglBindTexImage) \
+        (EGLDisplay, EGLSurface, EGLint)) \
+    FN(eglChooseConfig, EGLBoolean (*EGLAPIENTRY PFN_eglChooseConfig) \
+        (EGLDisplay, const EGLint *, EGLConfig *, EGLint, EGLint *)) \
+    FN(eglCreateContext, EGLContext (*EGLAPIENTRY PFN_eglCreateContext) \
+        (EGLDisplay, EGLConfig, EGLContext, const EGLint *)) \
+    FN(eglCreatePbufferFromClientBuffer, EGLSurface (*EGLAPIENTRY \
+        PFN_eglCreatePbufferFromClientBuffer)(EGLDisplay, EGLenum, \
+        EGLClientBuffer, EGLConfig, const EGLint *)) \
+    FN(eglCreateWindowSurface, EGLSurface (*EGLAPIENTRY \
+        PFN_eglCreateWindowSurface)(EGLDisplay, EGLConfig, \
+        EGLNativeWindowType, const EGLint *)) \
+    FN(eglDestroyContext, EGLBoolean (*EGLAPIENTRY PFN_eglDestroyContext) \
+        (EGLDisplay, EGLContext)) \
+    FN(eglDestroySurface, EGLBoolean (*EGLAPIENTRY PFN_eglDestroySurface) \
+        (EGLDisplay, EGLSurface)) \
+    FN(eglGetConfigAttrib, EGLBoolean (*EGLAPIENTRY PFN_eglGetConfigAttrib) \
+        (EGLDisplay, EGLConfig, EGLint, EGLint *)) \
+    FN(eglGetCurrentContext, EGLContext (*EGLAPIENTRY \
+        PFN_eglGetCurrentContext)(void)) \
+    FN(eglGetCurrentDisplay, EGLDisplay (*EGLAPIENTRY \
+        PFN_eglGetCurrentDisplay)(void)) \
+    FN(eglGetDisplay, EGLDisplay (*EGLAPIENTRY PFN_eglGetDisplay) \
+        (EGLNativeDisplayType)) \
+    FN(eglGetError, EGLint (*EGLAPIENTRY PFN_eglGetError)(void)) \
+    FN(eglGetProcAddress, void *(*EGLAPIENTRY \
+        PFN_eglGetProcAddress)(const char *)) \
+    FN(eglInitialize, EGLBoolean (*EGLAPIENTRY PFN_eglInitialize) \
+        (EGLDisplay, EGLint *, EGLint *)) \
+    FN(eglMakeCurrent, EGLBoolean (*EGLAPIENTRY PFN_eglMakeCurrent) \
+        (EGLDisplay, EGLSurface, EGLSurface, EGLContext)) \
+    FN(eglQueryString, const char *(*EGLAPIENTRY PFN_eglQueryString) \
+        (EGLDisplay, EGLint)) \
+    FN(eglSwapBuffers, EGLBoolean (*EGLAPIENTRY PFN_eglSwapBuffers) \
+        (EGLDisplay, EGLSurface)) \
+    FN(eglReleaseTexImage, EGLBoolean (*EGLAPIENTRY PFN_eglReleaseTexImage) \
+        (EGLDisplay, EGLSurface, EGLint)) \
+    FN(eglTerminate, EGLBoolean (*EGLAPIENTRY PFN_eglTerminate)(EGLDisplay))
+
+#define ANGLE_EXT_DECL(NAME, VAR) \
+    extern VAR;
+ANGLE_FNS(ANGLE_EXT_DECL)
+
+bool angle_load(void);
+
+// Source compatibility to statically linked ANGLE.
+#ifndef ANGLE_NO_ALIASES
+#define eglBindAPI                      PFN_eglBindAPI
+#define eglBindTexImage                 PFN_eglBindTexImage
+#define eglChooseConfig                 PFN_eglChooseConfig
+#define eglCreateContext                PFN_eglCreateContext
+#define eglCreatePbufferFromClientBuffer PFN_eglCreatePbufferFromClientBuffer
+#define eglCreateWindowSurface          PFN_eglCreateWindowSurface
+#define eglDestroyContext               PFN_eglDestroyContext
+#define eglDestroySurface               PFN_eglDestroySurface
+#define eglGetConfigAttrib              PFN_eglGetConfigAttrib
+#define eglGetCurrentContext            PFN_eglGetCurrentContext
+#define eglGetCurrentDisplay            PFN_eglGetCurrentDisplay
+#define eglGetDisplay                   PFN_eglGetDisplay
+#define eglGetError                     PFN_eglGetError
+#define eglGetProcAddress               PFN_eglGetProcAddress
+#define eglInitialize                   PFN_eglInitialize
+#define eglMakeCurrent                  PFN_eglMakeCurrent
+#define eglQueryString                  PFN_eglQueryString
+#define eglReleaseTexImage              PFN_eglReleaseTexImage
+#define eglSwapBuffers                  PFN_eglSwapBuffers
+#define eglTerminate                    PFN_eglTerminate
+#endif
+
+#endif
diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c
index 46cbc2f..dd44165 100644
--- a/video/out/opengl/common.c
+++ b/video/out/opengl/common.c
@@ -72,6 +72,8 @@ struct gl_functions {
     int provides;               // bitfield of MPGL_CAP_* constants
     int ver_core;               // introduced as required function
     int ver_es_core;            // introduced as required GL ES function
+    int ver_exclude;            // not applicable to versions >= ver_exclude
+    int ver_es_exclude;         // same for GLES
     const struct gl_function *functions;
 };
 
@@ -144,15 +146,23 @@ static const struct gl_functions gl_functions[] = {
         .ver_core = 210,
         .provides = MPGL_CAP_ROW_LENGTH | MPGL_CAP_1D_TEX,
         .functions = (const struct gl_function[]) {
-            DEF_FN(DrawBuffer),
             DEF_FN(GetTexLevelParameteriv),
-            DEF_FN(MapBuffer),
             DEF_FN(ReadBuffer),
             DEF_FN(TexImage1D),
             DEF_FN(UnmapBuffer),
             {0}
         },
     },
+    // GL 2.1 has this as extension only.
+    {
+        .ver_exclude = 300,
+        .ver_es_exclude = 300,
+        .extension = "GL_ARB_map_buffer_range",
+        .functions = (const struct gl_function[]) {
+            DEF_FN(MapBufferRange),
+            {0}
+        },
+    },
     // GL 3.0+ and ES 3.x core only functions.
     {
         .ver_core = 300,
@@ -161,6 +171,7 @@ static const struct gl_functions gl_functions[] = {
             DEF_FN(BindBufferBase),
             DEF_FN(BlitFramebuffer),
             DEF_FN(GetStringi),
+            DEF_FN(MapBufferRange),
             // for ES 3.0
             DEF_FN(ReadBuffer),
             DEF_FN(UnmapBuffer),
@@ -203,6 +214,7 @@ static const struct gl_functions gl_functions[] = {
             DEF_FN(DeleteFramebuffers),
             DEF_FN(CheckFramebufferStatus),
             DEF_FN(FramebufferTexture2D),
+            DEF_FN(GetFramebufferAttachmentParameteriv),
             {0}
         },
     },
@@ -227,6 +239,32 @@ static const struct gl_functions gl_functions[] = {
         .provides = MPGL_CAP_TEX_RG,
     },
     {
+        .ver_core = 300,
+        .ver_es_core = 300,
+        .extension = "GL_EXT_texture_rg",
+        .provides = MPGL_CAP_TEX_RG,
+    },
+    // GL_R16 etc.
+    {
+        .extension = "GL_EXT_texture_norm16",
+        .provides = MPGL_CAP_EXT16,
+        .ver_exclude = 1, // never in desktop GL
+    },
+    // Float texture support for GL 2.x
+    {
+        .extension = "GL_ARB_texture_float",
+        .provides = MPGL_CAP_ARB_FLOAT,
+        .ver_exclude = 300,
+        .ver_es_exclude = 1,
+    },
+    // 16 bit float textures that can be rendered to in GLES
+    {
+        .extension = "GL_EXT_color_buffer_half_float",
+        .provides = MPGL_CAP_EXT_CR_HFLOAT,
+        .ver_exclude = 1,
+        .ver_es_exclude = 320,
+    },
+    {
         .ver_core = 320,
         .extension = "GL_ARB_sync",
         .functions = (const struct gl_function[]) {
@@ -236,6 +274,47 @@ static const struct gl_functions gl_functions[] = {
             {0}
         },
     },
+    {
+        .ver_core = 330,
+        .extension = "GL_ARB_timer_query",
+        .functions = (const struct gl_function[]) {
+            DEF_FN(GenQueries),
+            DEF_FN(DeleteQueries),
+            DEF_FN(BeginQuery),
+            DEF_FN(EndQuery),
+            DEF_FN(QueryCounter),
+            DEF_FN(IsQuery),
+            DEF_FN(GetQueryObjectiv),
+            DEF_FN(GetQueryObjecti64v),
+            DEF_FN(GetQueryObjectuiv),
+            DEF_FN(GetQueryObjectui64v),
+            {0}
+        },
+    },
+    {
+        .extension = "GL_EXT_disjoint_timer_query",
+        .functions = (const struct gl_function[]) {
+            DEF_FN_NAME(GenQueries, "glGenQueriesEXT"),
+            DEF_FN_NAME(DeleteQueries, "glDeleteQueriesEXT"),
+            DEF_FN_NAME(BeginQuery, "glBeginQueryEXT"),
+            DEF_FN_NAME(EndQuery, "glEndQueryEXT"),
+            DEF_FN_NAME(QueryCounter, "glQueryCounterEXT"),
+            DEF_FN_NAME(IsQuery, "glIsQueryEXT"),
+            DEF_FN_NAME(GetQueryObjectiv, "glGetQueryObjectivEXT"),
+            DEF_FN_NAME(GetQueryObjecti64v, "glGetQueryObjecti64vEXT"),
+            DEF_FN_NAME(GetQueryObjectuiv, "glGetQueryObjectuivEXT"),
+            DEF_FN_NAME(GetQueryObjectui64v, "glGetQueryObjectui64vEXT"),
+            {0}
+        },
+    },
+    {
+        .ver_core = 430,
+        .ver_es_core = 300,
+        .functions = (const struct gl_function[]) {
+            DEF_FN(InvalidateFramebuffer),
+            {0}
+        },
+    },
     // Swap control, always an OS specific extension
     // The OSX code loads this manually.
     {
@@ -270,6 +349,7 @@ static const struct gl_functions gl_functions[] = {
             DEF_FN(VDPAUInitNV),
             DEF_FN(VDPAUFiniNV),
             DEF_FN(VDPAURegisterOutputSurfaceNV),
+            DEF_FN(VDPAURegisterVideoSurfaceNV),
             DEF_FN(VDPAUUnregisterSurfaceNV),
             DEF_FN(VDPAUSurfaceAccessNV),
             DEF_FN(VDPAUMapSurfacesNV),
@@ -327,14 +407,10 @@ static const struct gl_functions gl_functions[] = {
             {0}
         },
     },
-    // uniform buffer object extensions, requires OpenGL 3.1.
     {
-        .ver_core = 310,
-        .ver_es_core = 300,
-        .extension = "GL_ARB_uniform_buffer_object",
+        .extension = "GL_ANGLE_translated_shader_source",
         .functions = (const struct gl_function[]) {
-            DEF_FN(GetUniformBlockIndex),
-            DEF_FN(UniformBlockBinding),
+            DEF_FN(GetTranslatedShaderSourceANGLE),
             {0}
         },
     },
@@ -348,11 +424,9 @@ static const struct gl_functions gl_functions[] = {
 
 // Fill the GL struct with function pointers and extensions from the current
 // GL context. Called by the backend.
-// getProcAddress: function to resolve function names, may be NULL
+// get_fn: function to resolve function names
 // ext2: an extra extension string
 // log: used to output messages
-// Note: if you create a CONTEXT_FORWARD_COMPATIBLE_BIT_ARB with OpenGL 3.0,
-//       you must append "GL_ARB_compatibility" to ext2.
 void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
                           void *fn_ctx, const char *ext2, struct mp_log *log)
 {
@@ -428,6 +502,13 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
         // NOTE: Function entrypoints can exist, even if they do not work.
         //       We must always check extension strings and versions.
 
+        if (gl->version && section->ver_exclude &&
+            gl->version >= section->ver_exclude)
+            continue;
+        if (gl->es && section->ver_es_exclude &&
+            gl->es >= section->ver_es_exclude)
+            continue;
+
         bool exists = false, must_exist = false;
         if (ver_core)
             must_exist = version >= ver_core;
@@ -448,13 +529,15 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
             void *ptr = get_fn(fn_ctx, fn->name);
             if (!ptr) {
                 all_loaded = false;
-                mp_warn(log, "Required function '%s' not "
-                        "found for %s OpenGL %d.%d.\n", fn->name,
-                        section->extension ? section->extension : "builtin",
-                        MPGL_VER_GET_MAJOR(ver_core),
-                        MPGL_VER_GET_MINOR(ver_core));
-                if (must_exist)
+                if (must_exist) {
+                    mp_err(log, "GL %d.%d function %s not found.\n",
+                           MPGL_VER_GET_MAJOR(ver_core),
+                           MPGL_VER_GET_MINOR(ver_core), fn->name);
                     goto error;
+                } else {
+                    mp_warn(log, "Function %s from extension %s not found.\n",
+                            fn->name, section->extension);
+                }
                 break;
             }
             assert(i < MAX_FN_COUNT);
@@ -469,8 +552,8 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
                 if (loaded[i])
                     *funcptr = loaded[i];
             }
-            mp_verbose(log, "Loaded functions for %d/%s.\n", ver_core,
-                       section->extension ? section->extension : "builtin");
+            if (!must_exist && section->extension)
+                mp_verbose(log, "Loaded extension %s.\n", section->extension);
         }
     }
 
@@ -494,14 +577,6 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
         mp_verbose(log, "Detected suspected software renderer.\n");
     }
 
-    // Detect 16F textures that work with GL_LINEAR filtering.
-    if ((!gl->es && (gl->version >= 300 || check_ext(gl, "GL_ARB_texture_float"))) ||
-        (gl->es && (gl->version >= 310 || check_ext(gl, "GL_OES_texture_half_float_linear"))))
-    {
-        mp_verbose(log, "Filterable half-float textures supported.\n");
-        gl->mpgl_caps |= MPGL_CAP_FLOAT_TEX;
-    }
-
     // Provided for simpler handling if no framebuffer support is available.
     if (!gl->BindFramebuffer)
         gl->BindFramebuffer = &dummy_glBindFramebuffer;
diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h
index f790dcb..e3ebd66 100644
--- a/video/out/opengl/common.h
+++ b/video/out/opengl/common.h
@@ -53,7 +53,6 @@ enum {
     MPGL_CAP_ROW_LENGTH         = (1 << 4),     // GL_[UN]PACK_ROW_LENGTH
     MPGL_CAP_FB                 = (1 << 5),
     MPGL_CAP_VAO                = (1 << 6),
-    MPGL_CAP_FLOAT_TEX          = (1 << 9),
     MPGL_CAP_TEX_RG             = (1 << 10),    // GL_ARB_texture_rg / GL 3.x
     MPGL_CAP_VDPAU              = (1 << 11),    // GL_NV_vdpau_interop
     MPGL_CAP_APPLE_RGB_422      = (1 << 12),    // GL_APPLE_rgb_422
@@ -61,6 +60,10 @@ enum {
     MPGL_CAP_3D_TEX             = (1 << 15),
     MPGL_CAP_DEBUG              = (1 << 16),
     MPGL_CAP_DXINTEROP          = (1 << 17),    // WGL_NV_DX_interop
+    MPGL_CAP_EXT16              = (1 << 18),    // GL_EXT_texture_norm16
+    MPGL_CAP_ARB_FLOAT          = (1 << 19),    // GL_ARB_texture_float
+    MPGL_CAP_EXT_CR_HFLOAT      = (1 << 20),    // GL_EXT_color_buffer_half_float
+
     MPGL_CAP_SW                 = (1 << 30),    // indirect or sw renderer
 };
 
@@ -88,7 +91,7 @@ struct GL {
     char *extensions;           // Equivalent to GL_EXTENSIONS
     int mpgl_caps;              // Bitfield of MPGL_CAP_* constants
     bool debug_context;         // use of e.g. GLX_CONTEXT_DEBUG_BIT_ARB
-    int fb_r, fb_g, fb_b;       // frame buffer bit depth (0 if unknown)
+    GLuint main_fb;             // framebuffer to render to (normally 0)
 
     void (GLAPIENTRY *Viewport)(GLint, GLint, GLsizei, GLsizei);
     void (GLAPIENTRY *Clear)(GLbitfield);
@@ -98,7 +101,6 @@ struct GL {
     void (GLAPIENTRY *Enable)(GLenum);
     void (GLAPIENTRY *Disable)(GLenum);
     const GLubyte *(GLAPIENTRY * GetString)(GLenum);
-    void (GLAPIENTRY *DrawBuffer)(GLenum);
     void (GLAPIENTRY *BlendFuncSeparate)(GLenum, GLenum, GLenum, GLenum);
     void (GLAPIENTRY *Flush)(void);
     void (GLAPIENTRY *Finish)(void);
@@ -123,7 +125,8 @@ struct GL {
     void (GLAPIENTRY *DeleteBuffers)(GLsizei, const GLuint *);
     void (GLAPIENTRY *BindBuffer)(GLenum, GLuint);
     void (GLAPIENTRY *BindBufferBase)(GLenum, GLuint, GLuint);
-    GLvoid * (GLAPIENTRY * MapBuffer)(GLenum, GLenum);
+    GLvoid * (GLAPIENTRY *MapBufferRange)(GLenum, GLintptr, GLsizeiptr,
+                                          GLbitfield);
     GLboolean (GLAPIENTRY *UnmapBuffer)(GLenum);
     void (GLAPIENTRY *BufferData)(GLenum, intptr_t, const GLvoid *, GLenum);
     void (GLAPIENTRY *ActiveTexture)(GLenum);
@@ -166,6 +169,8 @@ struct GL {
                                             GLint);
     void (GLAPIENTRY *BlitFramebuffer)(GLint, GLint, GLint, GLint, GLint, GLint,
                                        GLint, GLint, GLbitfield, GLenum);
+    void (GLAPIENTRY *GetFramebufferAttachmentParameteriv)(GLenum, GLenum,
+                                                           GLenum, GLint *);
 
     void (GLAPIENTRY *Uniform1f)(GLint, GLfloat);
     void (GLAPIENTRY *Uniform2f)(GLint, GLfloat, GLfloat);
@@ -177,14 +182,29 @@ struct GL {
     void (GLAPIENTRY *UniformMatrix3fv)(GLint, GLsizei, GLboolean,
                                         const GLfloat *);
 
+    void (GLAPIENTRY *InvalidateFramebuffer)(GLenum, GLsizei, const GLenum *);
+
     GLsync (GLAPIENTRY *FenceSync)(GLenum, GLbitfield);
     GLenum (GLAPIENTRY *ClientWaitSync)(GLsync, GLbitfield, GLuint64);
     void (GLAPIENTRY *DeleteSync)(GLsync sync);
 
+    void (GLAPIENTRY *GenQueries)(GLsizei, GLuint *);
+    void (GLAPIENTRY *DeleteQueries)(GLsizei, const GLuint *);
+    void (GLAPIENTRY *BeginQuery)(GLenum,  GLuint);
+    void (GLAPIENTRY *EndQuery)(GLenum);
+    void (GLAPIENTRY *QueryCounter)(GLuint, GLenum);
+    GLboolean (GLAPIENTRY *IsQuery)(GLuint);
+    void (GLAPIENTRY *GetQueryObjectiv)(GLuint, GLenum, GLint *);
+    void (GLAPIENTRY *GetQueryObjecti64v)(GLuint, GLenum, GLint64 *);
+    void (GLAPIENTRY *GetQueryObjectuiv)(GLuint, GLenum, GLuint *);
+    void (GLAPIENTRY *GetQueryObjectui64v)(GLuint, GLenum, GLuint64 *);
+
     void (GLAPIENTRY *VDPAUInitNV)(const GLvoid *, const GLvoid *);
     void (GLAPIENTRY *VDPAUFiniNV)(void);
     GLvdpauSurfaceNV (GLAPIENTRY *VDPAURegisterOutputSurfaceNV)
         (GLvoid *, GLenum, GLsizei, const GLuint *);
+    GLvdpauSurfaceNV (GLAPIENTRY *VDPAURegisterVideoSurfaceNV)
+        (GLvoid *, GLenum, GLsizei, const GLuint *);
     void (GLAPIENTRY *VDPAUUnregisterSurfaceNV)(GLvdpauSurfaceNV);
     void (GLAPIENTRY *VDPAUSurfaceAccessNV)(GLvdpauSurfaceNV, GLenum);
     void (GLAPIENTRY *VDPAUMapSurfacesNV)(GLsizei, const GLvdpauSurfaceNV *);
@@ -208,8 +228,8 @@ struct GL {
     GLint (GLAPIENTRY *GetVideoSync)(GLuint *);
     GLint (GLAPIENTRY *WaitVideoSync)(GLint, GLint, unsigned int *);
 
-    GLuint (GLAPIENTRY *GetUniformBlockIndex)(GLuint, const GLchar *);
-    void (GLAPIENTRY *UniformBlockBinding)(GLuint, GLuint, GLuint);
+    void (GLAPIENTRY *GetTranslatedShaderSourceANGLE)(GLuint, GLsizei,
+                                                      GLsizei*, GLchar* source);
 
     void (GLAPIENTRY *DebugMessageCallback)(MP_GLDEBUGPROC callback,
                                             const void *userParam);
diff --git a/video/out/opengl/context.c b/video/out/opengl/context.c
index 77e9709..186211d 100644
--- a/video/out/opengl/context.c
+++ b/video/out/opengl/context.c
@@ -42,6 +42,7 @@ extern const struct mpgl_driver mpgl_driver_cocoa;
 extern const struct mpgl_driver mpgl_driver_wayland;
 extern const struct mpgl_driver mpgl_driver_w32;
 extern const struct mpgl_driver mpgl_driver_angle;
+extern const struct mpgl_driver mpgl_driver_angle_es2;
 extern const struct mpgl_driver mpgl_driver_dxinterop;
 extern const struct mpgl_driver mpgl_driver_rpi;
 
@@ -54,6 +55,7 @@ static const struct mpgl_driver *const backends[] = {
 #endif
 #if HAVE_EGL_ANGLE
     &mpgl_driver_angle,
+    &mpgl_driver_angle_es2,
 #endif
 #if HAVE_GL_WIN32
     &mpgl_driver_w32,
diff --git a/video/out/opengl/context_angle.c b/video/out/opengl/context_angle.c
index b922ce8..cc14fc3 100644
--- a/video/out/opengl/context_angle.c
+++ b/video/out/opengl/context_angle.c
@@ -18,15 +18,26 @@
 #include <windows.h>
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
+#include <d3d11.h>
+#include <dxgi.h>
+
+#include "angle_dynamic.h"
 
 #include "common/common.h"
 #include "video/out/w32_common.h"
 #include "context.h"
 
+#ifndef EGL_OPTIMAL_SURFACE_ORIENTATION_ANGLE
+#define EGL_OPTIMAL_SURFACE_ORIENTATION_ANGLE 0x33A7
+#define EGL_SURFACE_ORIENTATION_ANGLE 0x33A8
+#define EGL_SURFACE_ORIENTATION_INVERT_Y_ANGLE 0x0002
+#endif
+
 struct priv {
     EGLDisplay egl_display;
     EGLContext egl_context;
     EGLSurface egl_surface;
+    bool use_es2;   // set by angle_init_es2(): don't try a version 3 context
 };
 
 static void angle_uninit(MPGLContext *ctx)
@@ -39,6 +50,8 @@ static void angle_uninit(MPGLContext *ctx)
         eglDestroyContext(p->egl_display, p->egl_context);
     }
     p->egl_context = EGL_NO_CONTEXT;
+    if (p->egl_display)
+        eglTerminate(p->egl_display);
     vo_w32_uninit(ctx->vo);
 }
 
@@ -90,6 +103,74 @@ static bool create_context_egl(MPGLContext *ctx, EGLConfig config, int version)
     return true;
 }
 
+// Configure the Direct3D 11 device backing the ANGLE display, if there is one.
+// Nothing is reported to the caller: on any failure the function just releases
+// the DXGI interfaces acquired so far and returns.
+static void d3d_init(struct MPGLContext *ctx)
+{
+    struct vo *vo = ctx->vo;
+    struct priv *p = ctx->priv;
+    IDXGIDevice *device = NULL;
+    IDXGIAdapter *adapter = NULL;
+    IDXGIFactory *factory = NULL;
+
+    // Both entry points are provided by EGL_EXT_device_query.
+    PFNEGLQUERYDISPLAYATTRIBEXTPROC query_display =
+        (PFNEGLQUERYDISPLAYATTRIBEXTPROC)eglGetProcAddress("eglQueryDisplayAttribEXT");
+    PFNEGLQUERYDEVICEATTRIBEXTPROC query_device =
+        (PFNEGLQUERYDEVICEATTRIBEXTPROC)eglGetProcAddress("eglQueryDeviceAttribEXT");
+    if (!(query_display && query_device)) {
+        MP_VERBOSE(vo, "Missing EGL_EXT_device_query\n");
+        goto done;
+    }
+
+    EGLAttrib egl_dev;
+    if (!query_display(p->egl_display, EGL_DEVICE_EXT, &egl_dev)) {
+        MP_VERBOSE(vo, "Missing EGL_EXT_device_query\n");
+        goto done;
+    }
+
+    // The ID3D11Device can only be queried when ANGLE runs in D3D11 mode;
+    // without it there is nothing to set up here.
+    EGLAttrib d3d_attr;
+    if (!query_device((EGLDeviceEXT)egl_dev, EGL_D3D11_DEVICE_ANGLE, &d3d_attr))
+        goto done;
+    ID3D11Device *d3d11 = (ID3D11Device *)d3d_attr;
+
+    if (FAILED(ID3D11Device_QueryInterface(d3d11, &IID_IDXGIDevice,
+                                           (void **)&device))) {
+        MP_ERR(vo, "Device is not a IDXGIDevice\n");
+        goto done;
+    }
+
+    if (FAILED(IDXGIDevice_GetAdapter(device, &adapter))) {
+        MP_ERR(vo, "Couldn't get IDXGIAdapter\n");
+        goto done;
+    }
+
+    if (FAILED(IDXGIAdapter_GetParent(adapter, &IID_IDXGIFactory,
+                                      (void **)&factory))) {
+        MP_ERR(vo, "Couldn't get IDXGIFactory\n");
+        goto done;
+    }
+
+    // Stop DXGI from altering the VO window. Outside DirectComposition mode it
+    // would otherwise intercept Alt+Enter and switch to exclusive fullscreen
+    // with an ugly transition, instead of letting the user-bound command run.
+    IDXGIFactory_MakeWindowAssociation(factory, vo_w32_hwnd(vo),
+                                       DXGI_MWA_NO_WINDOW_CHANGES |
+                                       DXGI_MWA_NO_ALT_ENTER |
+                                       DXGI_MWA_NO_PRINT_SCREEN);
+
+done:
+    if (device)
+        IDXGIDevice_Release(device);
+    if (adapter)
+        IDXGIAdapter_Release(adapter);
+    if (factory)
+        IDXGIFactory_Release(factory);
+}
+
 static void *get_proc_address(const GLubyte *proc_name)
 {
     return eglGetProcAddress(proc_name);
@@ -100,6 +181,11 @@ static int angle_init(struct MPGLContext *ctx, int flags)
     struct priv *p = ctx->priv;
     struct vo *vo = ctx->vo;
 
+    if (!angle_load()) {
+        MP_VERBOSE(vo, "Failed to load LIBEGL.DLL\n");
+        goto fail;
+    }
+
     if (!vo_w32_init(vo))
         goto fail;
 
@@ -142,6 +228,10 @@ static int angle_init(struct MPGLContext *ctx, int flags)
         goto fail;
     }
 
+    const char *exts = eglQueryString(p->egl_display, EGL_EXTENSIONS);
+    if (exts)
+        MP_DBG(ctx->vo, "EGL extensions: %s\n", exts);
+
     eglBindAPI(EGL_OPENGL_ES_API);
     if (eglGetError() != EGL_SUCCESS) {
         MP_FATAL(vo, "Couldn't bind GLES API\n");
@@ -152,22 +242,53 @@ static int angle_init(struct MPGLContext *ctx, int flags)
     if (!config)
         goto fail;
 
+    int window_attribs_len = 0;
+    EGLint *window_attribs = NULL;
+
+    EGLint flip_val;
+    if (eglGetConfigAttrib(p->egl_display, config,
+                           EGL_OPTIMAL_SURFACE_ORIENTATION_ANGLE, &flip_val))
+    {
+        if (flip_val == EGL_SURFACE_ORIENTATION_INVERT_Y_ANGLE) {
+            MP_TARRAY_APPEND(NULL, window_attribs, window_attribs_len,
+                EGL_SURFACE_ORIENTATION_ANGLE);
+            MP_TARRAY_APPEND(NULL, window_attribs, window_attribs_len,
+                EGL_SURFACE_ORIENTATION_INVERT_Y_ANGLE);
+            ctx->flip_v = true;
+            MP_VERBOSE(vo, "Rendering flipped.\n");
+        }
+    }
+
+    // EGL_DIRECT_COMPOSITION_ANGLE enables flip-mode present, which avoids a copy
+    // of the video image and lowers vsync jitter. The extension only exists on
+    // Windows 8 and up; exts is NULL if eglQueryString() failed.
+    if (exts && strstr(exts, "EGL_ANGLE_direct_composition")) {
+        MP_TARRAY_APPEND(NULL, window_attribs, window_attribs_len,
+            EGL_DIRECT_COMPOSITION_ANGLE);
+        MP_TARRAY_APPEND(NULL, window_attribs, window_attribs_len, EGL_TRUE);
+        MP_VERBOSE(vo, "Using DirectComposition.\n");
+    }
+
+    MP_TARRAY_APPEND(NULL, window_attribs, window_attribs_len, EGL_NONE);
     p->egl_surface = eglCreateWindowSurface(p->egl_display, config,
-                                            vo_w32_hwnd(vo), NULL);
+                                            vo_w32_hwnd(vo), window_attribs);
+    talloc_free(window_attribs);
     if (p->egl_surface == EGL_NO_SURFACE) {
         MP_FATAL(ctx->vo, "Could not create EGL surface!\n");
         goto fail;
     }
 
-    if (!create_context_egl(ctx, config, 3) &&
+    if (!(!p->use_es2 && create_context_egl(ctx, config, 3)) &&
         !create_context_egl(ctx, config, 2))
     {
         MP_FATAL(ctx->vo, "Could not create EGL context!\n");
         goto fail;
     }
 
-    mpgl_load_functions(ctx->gl, get_proc_address, NULL, vo->log);
+    // Configure the underlying Direct3D device
+    d3d_init(ctx);
 
+    mpgl_load_functions(ctx->gl, get_proc_address, NULL, vo->log);
     return 0;
 
 fail:
@@ -175,6 +296,17 @@ fail:
     return -1;
 }
 
+// Init entry point of the "angle-es2" driver: flags the context as ES 2 only
+// and defers to angle_init(), except during autoprobing, where it returns -1.
+static int angle_init_es2(struct MPGLContext *ctx, int flags)
+{
+    struct vo *vo = ctx->vo;
+    ((struct priv *)ctx->priv)->use_es2 = true;
+    if (!vo->probing)
+        return angle_init(ctx, flags);
+    MP_VERBOSE(vo, "Not using this by default.\n");
+    return -1;
+}
+
 static int angle_reconfig(struct MPGLContext *ctx)
 {
     vo_w32_config(ctx->vo);
@@ -201,3 +333,13 @@ const struct mpgl_driver mpgl_driver_angle = {
     .control        = angle_control,
     .uninit         = angle_uninit,
 };
+
+const struct mpgl_driver mpgl_driver_angle_es2 = {
+    .name           = "angle-es2",
+    .priv_size      = sizeof(struct priv),
+    .init           = angle_init_es2,   // sets use_es2; fails while probing
+    .reconfig       = angle_reconfig,
+    .swap_buffers   = angle_swap_buffers,
+    .control        = angle_control,
+    .uninit         = angle_uninit,
+};
diff --git a/video/out/opengl/context_cocoa.c b/video/out/opengl/context_cocoa.c
index 271bdb7..ea7a9b5 100644
--- a/video/out/opengl/context_cocoa.c
+++ b/video/out/opengl/context_cocoa.c
@@ -33,14 +33,6 @@ static int set_swap_interval(int enabled)
     return (err == kCGLNoError) ? 0 : -1;
 }
 
-static int cgl_color_size(struct MPGLContext *ctx)
-{
-    struct cgl_context *p = ctx->priv;
-    GLint value;
-    CGLDescribePixelFormat(p->pix, 0, kCGLPFAColorSize, &value);
-    return value > 16 ? 8 : 5;
-}
-
 static void *cocoa_glgetaddr(const char *s)
 {
     void *ret = NULL;
@@ -123,7 +115,6 @@ static bool create_gl_context(struct MPGLContext *ctx, int vo_flags)
         CGLSetParameter(p->ctx, kCGLCPSurfaceOpacity, &(GLint){0});
 
     mpgl_load_functions(ctx->gl, (void *)cocoa_glgetaddr, NULL, ctx->vo->log);
-    ctx->gl->fb_r = ctx->gl->fb_g = ctx->gl->fb_b = cgl_color_size(ctx);
 
     CGLReleasePixelFormat(p->pix);
 
diff --git a/video/out/opengl/context_dxinterop.c b/video/out/opengl/context_dxinterop.c
index 4dfc3c2..95b9296 100644
--- a/video/out/opengl/context_dxinterop.c
+++ b/video/out/opengl/context_dxinterop.c
@@ -27,6 +27,9 @@
 // For WGL_ACCESS_WRITE_DISCARD_NV, etc.
 #include <GL/wglext.h>
 
+EXTERN_C IMAGE_DOS_HEADER __ImageBase;
+#define HINST_THISCOMPONENT ((HINSTANCE)&__ImageBase)
+
 // mingw-w64 header typo?
 #ifndef IDirect3DSwapChain9Ex_GetBackBuffer
 #define IDirect3DSwapChain9Ex_GetBackBuffer IDirect3DSwapChain9EX_GetBackBuffer
@@ -51,21 +54,14 @@ struct priv {
     HGLRC os_ctx;
 
     // OpenGL resources
-    GLuint framebuffer;
     GLuint texture;
 
-    // Is the shared framebuffer currently bound?
-    bool fb_bound;
-    // Is the shared texture currently attached?
-    bool tex_attached;
     // Did we lose the device?
     bool lost_device;
 
     // Requested and current parameters
     int requested_swapinterval;
     int width, height, swapinterval;
-
-    void (GLAPIENTRY *real_gl_bind_framebuffer)(GLenum, GLuint);
 };
 
 static __thread struct MPGLContext *current_ctx;
@@ -99,7 +95,7 @@ static int os_ctx_create(struct MPGLContext *ctx)
         .cbSize = sizeof(WNDCLASSEXW),
         .style = CS_OWNDC,
         .lpfnWndProc = DefWindowProc,
-        .hInstance = GetModuleHandleW(NULL),
+        .hInstance = HINST_THISCOMPONENT,
         .lpszClassName = os_wnd_class,
     });
 
@@ -107,7 +103,7 @@ static int os_ctx_create(struct MPGLContext *ctx)
     // possible to use the VO window, but MSDN recommends against drawing to
     // the same window with flip mode present and other APIs, so play it safe.
     p->os_wnd = CreateWindowExW(0, os_wnd_class, os_wnd_class, 0, 0, 0, 200,
-        200, NULL, NULL, GetModuleHandleW(NULL), NULL);
+        200, NULL, NULL, HINST_THISCOMPONENT, NULL);
     p->os_dc = GetDC(p->os_wnd);
     if (!p->os_dc) {
         MP_FATAL(ctx->vo, "Couldn't create window for offscreen rendering\n");
@@ -224,18 +220,6 @@ static void os_ctx_destroy(MPGLContext *ctx)
         DestroyWindow(p->os_wnd);
 }
 
-static void try_attach_texture(MPGLContext *ctx)
-{
-    struct priv *p = ctx->priv;
-    struct GL *gl = ctx->gl;
-
-    if (p->fb_bound && !p->tex_attached) {
-        gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
-            GL_TEXTURE_2D, p->texture, 0);
-        p->tex_attached = true;
-    }
-}
-
 static int d3d_size_dependent_create(MPGLContext *ctx)
 {
     struct priv *p = ctx->priv;
@@ -275,25 +259,6 @@ static int d3d_size_dependent_create(MPGLContext *ctx)
     MP_VERBOSE(ctx->vo, "DX_interop backbuffer format: %u\n",
         (unsigned)bb_desc.Format);
 
-    // Note: This backend has only been tested on an 8-bit display. It's
-    // unknown whether this code is enough to support other formats or if more
-    // work is needed.
-    switch (bb_desc.Format) {
-    case D3DFMT_X1R5G5B5: case D3DFMT_A1R5G5B5:
-        ctx->gl->fb_r = ctx->gl->fb_g = ctx->gl->fb_b = 5;
-        break;
-    case D3DFMT_R5G6B5:
-        ctx->gl->fb_r = 5; ctx->gl->fb_g = 6; ctx->gl->fb_b = 5;
-        break;
-    case D3DFMT_R8G8B8: case D3DFMT_A8R8G8B8: case D3DFMT_X8R8G8B8:
-    case D3DFMT_A8B8G8R8: case D3DFMT_X8B8G8R8: default:
-        ctx->gl->fb_r = ctx->gl->fb_g = ctx->gl->fb_b = 8;
-        break;
-    case D3DFMT_A2R10G10B10: case D3DFMT_A2B10G10R10:
-        ctx->gl->fb_r = ctx->gl->fb_g = ctx->gl->fb_b = 10;
-        break;
-    }
-
     // Create a rendertarget with the same format as the backbuffer for
     // rendering from OpenGL
     HANDLE share_handle = NULL;
@@ -312,7 +277,6 @@ static int d3d_size_dependent_create(MPGLContext *ctx)
 
     // Create the OpenGL-side texture
     gl->GenTextures(1, &p->texture);
-    p->tex_attached = false;
 
     // Now share the rendertarget with OpenGL as a texture
     p->rtarget_h = gl->DXRegisterObjectNV(p->device_h, p->rtarget, p->texture,
@@ -331,9 +295,10 @@ static int d3d_size_dependent_create(MPGLContext *ctx)
         return -1;
     }
 
-    // Only attach the shared texture if the shared framebuffer is bound. If
-    // it's not, the texture will be attached when glBindFramebuffer is called.
-    try_attach_texture(ctx);
+    gl->BindFramebuffer(GL_FRAMEBUFFER, gl->main_fb);
+    gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
+        GL_TEXTURE_2D, p->texture, 0);
+    gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
 
     return 0;
 }
@@ -476,27 +441,6 @@ static void dxinterop_uninit(MPGLContext *ctx)
     pump_message_loop();
 }
 
-static GLAPIENTRY void dxinterop_bind_framebuffer(GLenum target,
-    GLuint framebuffer)
-{
-    if (!current_ctx)
-        return;
-    struct priv *p = current_ctx->priv;
-
-    // Keep track of whether the shared framebuffer is bound
-    if (target == GL_FRAMEBUFFER || target == GL_DRAW_FRAMEBUFFER)
-        p->fb_bound = (framebuffer == 0);
-
-    // Pretend the shared framebuffer is the primary framebuffer
-    if (framebuffer == 0)
-        framebuffer = p->framebuffer;
-
-    p->real_gl_bind_framebuffer(target, framebuffer);
-
-    // Attach the shared texture if it is not attached already
-    try_attach_texture(current_ctx);
-}
-
 static void dxinterop_reset(struct MPGLContext *ctx)
 {
     struct priv *p = ctx->priv;
@@ -570,16 +514,10 @@ static int dxinterop_init(struct MPGLContext *ctx, int flags)
         goto fail;
 
     // Create the shared framebuffer
-    gl->GenFramebuffers(1, &p->framebuffer);
+    gl->GenFramebuffers(1, &gl->main_fb);
 
-    // Hook glBindFramebuffer to return the shared framebuffer instead of the
-    // primary one
     current_ctx = ctx;
-    p->real_gl_bind_framebuffer = gl->BindFramebuffer;
-    gl->BindFramebuffer = dxinterop_bind_framebuffer;
-
     gl->SwapInterval = dxinterop_swap_interval;
-
     gl->MPGetNativeDisplay = dxinterop_get_native_display;
 
     if (d3d_create(ctx) < 0)
@@ -587,9 +525,6 @@ static int dxinterop_init(struct MPGLContext *ctx, int flags)
     if (d3d_size_dependent_create(ctx) < 0)
         goto fail;
 
-    // Bind the shared framebuffer. This will also attach the shared texture.
-    gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
-
     // The OpenGL and Direct3D coordinate systems are flipped vertically
     // relative to each other. Flip the video during rendering so it can be
     // copied to the Direct3D backbuffer with a simple (and fast) StretchRect.
diff --git a/video/out/opengl/context_rpi.c b/video/out/opengl/context_rpi.c
index c01c173..c0ca733 100644
--- a/video/out/opengl/context_rpi.c
+++ b/video/out/opengl/context_rpi.c
@@ -19,7 +19,6 @@
 #include <assert.h>
 
 #include "common/common.h"
-#include "video/out/x11_common.h"
 #include "context.h"
 
 #include "context_rpi.h"
diff --git a/video/out/opengl/context_w32.c b/video/out/opengl/context_w32.c
index c647d97..3a0118e 100644
--- a/video/out/opengl/context_w32.c
+++ b/video/out/opengl/context_w32.c
@@ -209,14 +209,6 @@ static void create_ctx(void *ptr)
     if (!w32_ctx->context)
         create_context_w32_old(ctx);
 
-    int pfmt = GetPixelFormat(w32_ctx->hdc);
-    PIXELFORMATDESCRIPTOR pfd;
-    if (DescribePixelFormat(w32_ctx->hdc, pfmt, sizeof(pfd), &pfd)) {
-        ctx->gl->fb_r = pfd.cRedBits;
-        ctx->gl->fb_g = pfd.cGreenBits;
-        ctx->gl->fb_b = pfd.cBlueBits;
-    }
-
     wglMakeCurrent(w32_ctx->hdc, NULL);
 }
 
diff --git a/video/out/opengl/context_wayland.c b/video/out/opengl/context_wayland.c
index a100073..e74132b 100644
--- a/video/out/opengl/context_wayland.c
+++ b/video/out/opengl/context_wayland.c
@@ -25,10 +25,14 @@ static void egl_resize(struct vo_wayland_state *wl)
     int32_t y = wl->window.sh_y;
     int32_t width = wl->window.sh_width;
     int32_t height = wl->window.sh_height;
+    int32_t scale = 1;
 
     if (!wl->egl_context.egl_window)
         return;
 
+    if (wl->display.current_output)
+        scale = wl->display.current_output->scale;
+
     // get the real size of the window
     // this improves moving the window while resizing it
     wl_egl_window_get_attached_size(wl->egl_context.egl_window,
@@ -46,14 +50,15 @@ static void egl_resize(struct vo_wayland_state *wl)
     if (y != 0)
         y = wl->window.height - height;
 
-    wl_egl_window_resize(wl->egl_context.egl_window, width, height, x, y);
+    wl_surface_set_buffer_scale(wl->window.video_surface, scale);
+    wl_egl_window_resize(wl->egl_context.egl_window, scale*width, scale*height, x, y);
 
     wl->window.width = width;
     wl->window.height = height;
 
     /* set size for mplayer */
-    wl->vo->dwidth = wl->window.width;
-    wl->vo->dheight = wl->window.height;
+    wl->vo->dwidth  = scale*wl->window.width;
+    wl->vo->dheight = scale*wl->window.height;
 
     wl->vo->want_redraw = true;
     wl->window.events = 0;
diff --git a/video/out/opengl/context_x11.c b/video/out/opengl/context_x11.c
index d9a584e..11700ef 100644
--- a/video/out/opengl/context_x11.c
+++ b/video/out/opengl/context_x11.c
@@ -271,10 +271,6 @@ static int glx_init(struct MPGLContext *ctx, int flags)
     if (!success)
         goto uninit;
 
-    glXGetFBConfigAttrib(vo->x11->display, fbc, GLX_RED_SIZE, &ctx->gl->fb_r);
-    glXGetFBConfigAttrib(vo->x11->display, fbc, GLX_GREEN_SIZE, &ctx->gl->fb_g);
-    glXGetFBConfigAttrib(vo->x11->display, fbc, GLX_BLUE_SIZE, &ctx->gl->fb_b);
-
     return 0;
 
 uninit:
diff --git a/video/out/opengl/context_x11egl.c b/video/out/opengl/context_x11egl.c
index e6069b7..2e4fd5f 100644
--- a/video/out/opengl/context_x11egl.c
+++ b/video/out/opengl/context_x11egl.c
@@ -21,6 +21,11 @@
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
+#ifndef EGL_VERSION_1_5
+#define EGL_CONTEXT_OPENGL_PROFILE_MASK         0x30FD
+#define EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT     0x00000001
+#endif
+
 #include "common/common.h"
 #include "video/out/x11_common.h"
 #include "context.h"
@@ -79,9 +84,15 @@ static bool create_context_egl(MPGLContext *ctx, EGLConfig config,
     EGLint context_attributes[] = {
         // aka EGL_CONTEXT_MAJOR_VERSION_KHR
         EGL_CONTEXT_CLIENT_VERSION, es ? 2 : 3,
+        EGL_NONE, EGL_NONE,
         EGL_NONE
     };
 
+    if (!es) {
+        context_attributes[2] = EGL_CONTEXT_OPENGL_PROFILE_MASK;
+        context_attributes[3] = EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT;
+    }
+
     p->egl_surface = eglCreateWindowSurface(p->egl_display, config, window, NULL);
 
     if (p->egl_surface == EGL_NO_SURFACE) {
@@ -152,7 +163,6 @@ static int mpegl_init(struct MPGLContext *ctx, int flags)
 
     void *(*gpa)(const GLubyte*) = (void *(*)(const GLubyte*))eglGetProcAddress;
     mpgl_load_functions(ctx->gl, gpa, egl_exts, vo->log);
-    mp_egl_get_depth(ctx->gl, config);
 
     ctx->native_display_type = "x11";
     ctx->native_display = vo->x11->display;
diff --git a/video/out/opengl/egl_helpers.c b/video/out/opengl/egl_helpers.c
index d86b5be..7e236f1 100644
--- a/video/out/opengl/egl_helpers.c
+++ b/video/out/opengl/egl_helpers.c
@@ -18,13 +18,3 @@
 #include "egl_helpers.h"
 #include "common.h"
 
-void mp_egl_get_depth(struct GL *gl, EGLConfig fbc)
-{
-    EGLint tokens[] = {EGL_RED_SIZE, EGL_GREEN_SIZE, EGL_BLUE_SIZE};
-    int *ptrs[] =     {&gl->fb_r,    &gl->fb_g,      &gl->fb_b};
-    for (int n = 0; n < MP_ARRAY_SIZE(tokens); n++) {
-        EGLint depth = 0;
-        if (eglGetConfigAttrib(eglGetCurrentDisplay(), fbc, tokens[n], &depth))
-            *ptrs[n] = depth;
-    }
-}
diff --git a/video/out/opengl/egl_helpers.h b/video/out/opengl/egl_helpers.h
index f9961fe..3806ef1 100644
--- a/video/out/opengl/egl_helpers.h
+++ b/video/out/opengl/egl_helpers.h
@@ -4,7 +4,4 @@
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
-struct GL;
-void mp_egl_get_depth(struct GL *gl, EGLConfig fbc);
-
 #endif
diff --git a/video/out/opengl/formats.c b/video/out/opengl/formats.c
new file mode 100644
index 0000000..2e3dad0
--- /dev/null
+++ b/video/out/opengl/formats.c
@@ -0,0 +1,272 @@
+#include "common/common.h"
+#include "formats.h"
+
+enum {
+    // --- GL type aliases (for readability)
+    T_U8        = GL_UNSIGNED_BYTE,
+    T_U16       = GL_UNSIGNED_SHORT,
+    T_FL        = GL_FLOAT,
+};
+
+// List of allowed formats, and their usability for bilinear filtering and FBOs.
+// This is limited to combinations that are useful for our renderer.
+const struct gl_format gl_formats[] = {
+    // These are used for desktop GL 3+, and GLES 3+ with GL_EXT_texture_norm16.
+    {GL_R8,                  GL_RED,             T_U8,  F_CF | F_GL3 | F_GL2F | F_ES3},
+    {GL_RG8,                 GL_RG,              T_U8,  F_CF | F_GL3 | F_GL2F | F_ES3},
+    {GL_RGB8,                GL_RGB,             T_U8,  F_CF | F_GL3 | F_GL2F | F_ES3},
+    {GL_RGBA8,               GL_RGBA,            T_U8,  F_CF | F_GL3 | F_GL2F | F_ES3},
+    {GL_R16,                 GL_RED,             T_U16, F_CF | F_GL3 | F_GL2F | F_EXT16},
+    {GL_RG16,                GL_RG,              T_U16, F_CF | F_GL3 | F_GL2F | F_EXT16},
+    {GL_RGB16,               GL_RGB,             T_U16, F_CF | F_GL3 | F_GL2F},
+    {GL_RGBA16,              GL_RGBA,            T_U16, F_CF | F_GL3 | F_GL2F | F_EXT16},
+
+    // Specifically not color-renderable.
+    {GL_RGB16,               GL_RGB,             T_U16, F_TF | F_EXT16},
+
+    // GL2 legacy. Ignores possibly present FBO extensions (no CF flag set).
+    {GL_LUMINANCE8,          GL_LUMINANCE,       T_U8,  F_TF | F_GL2},
+    {GL_LUMINANCE8_ALPHA8,   GL_LUMINANCE_ALPHA, T_U8,  F_TF | F_GL2},
+    {GL_RGB8,                GL_RGB,             T_U8,  F_TF | F_GL2},
+    {GL_RGBA8,               GL_RGBA,            T_U8,  F_TF | F_GL2},
+    {GL_LUMINANCE16,         GL_LUMINANCE,       T_U16, F_TF | F_GL2},
+    {GL_LUMINANCE16_ALPHA16, GL_LUMINANCE_ALPHA, T_U16, F_TF | F_GL2},
+    {GL_RGB16,               GL_RGB,             T_U16, F_TF | F_GL2},
+    {GL_RGBA16,              GL_RGBA,            T_U16, F_TF | F_GL2},
+
+    // ES2 legacy
+    {GL_LUMINANCE,           GL_LUMINANCE,       T_U8,  F_TF | F_ES2},
+    {GL_LUMINANCE_ALPHA,     GL_LUMINANCE_ALPHA, T_U8,  F_TF | F_ES2},
+    {GL_RGB,                 GL_RGB,             T_U8,  F_TF | F_ES2},
+    {GL_RGBA,                GL_RGBA,            T_U8,  F_TF | F_ES2},
+
+    // Non-normalized integer formats.
+    // Follows ES 3.0 as to which are color-renderable.
+    {GL_R8UI,                GL_RED_INTEGER,     T_U8,  F_CR | F_GL3 | F_ES3},
+    {GL_RG8UI,               GL_RG_INTEGER,      T_U8,  F_CR | F_GL3 | F_ES3},
+    {GL_RGB8UI,              GL_RGB_INTEGER,     T_U8,         F_GL3 | F_ES3},
+    {GL_RGBA8UI,             GL_RGBA_INTEGER,    T_U8,  F_CR | F_GL3 | F_ES3},
+    {GL_R16UI,               GL_RED_INTEGER,     T_U16, F_CR | F_GL3 | F_ES3},
+    {GL_RG16UI,              GL_RG_INTEGER,      T_U16, F_CR | F_GL3 | F_ES3},
+    {GL_RGB16UI,             GL_RGB_INTEGER,     T_U16,        F_GL3 | F_ES3},
+    {GL_RGBA16UI,            GL_RGBA_INTEGER,    T_U16, F_CR | F_GL3 | F_ES3},
+
+    // On GL3+ or GL2.1 with GL_ARB_texture_float, floats work fully.
+    {GL_R16F,                GL_RED,             T_FL,  F_F16 | F_CF | F_GL3 | F_GL2F},
+    {GL_RG16F,               GL_RG,              T_FL,  F_F16 | F_CF | F_GL3 | F_GL2F},
+    {GL_RGB16F,              GL_RGB,             T_FL,  F_F16 | F_CF | F_GL3 | F_GL2F},
+    {GL_RGBA16F,             GL_RGBA,            T_FL,  F_F16 | F_CF | F_GL3 | F_GL2F},
+    {GL_R32F,                GL_RED,             T_FL,          F_CF | F_GL3 | F_GL2F},
+    {GL_RG32F,               GL_RG,              T_FL,          F_CF | F_GL3 | F_GL2F},
+    {GL_RGB32F,              GL_RGB,             T_FL,          F_CF | F_GL3 | F_GL2F},
+    {GL_RGBA32F,             GL_RGBA,            T_FL,          F_CF | F_GL3 | F_GL2F},
+
+    // Note: we simply don't support float anything on ES2, despite extensions.
+    // We also don't bother with non-filterable float formats, and we ignore
+    // 32 bit float formats that are not blendable when rendering to them.
+
+    // On ES3.2+, 16 bit floats work fully (except 3-component formats).
+    // F_EXTF16 implies extensions that also enable 16 bit floats fully.
+    {GL_R16F,                GL_RED,             T_FL,  F_F16 | F_CF | F_ES32 | F_EXTF16},
+    {GL_RG16F,               GL_RG,              T_FL,  F_F16 | F_CF | F_ES32 | F_EXTF16},
+    {GL_RGB16F,              GL_RGB,             T_FL,  F_F16 | F_TF | F_ES32 | F_EXTF16},
+    {GL_RGBA16F,             GL_RGBA,            T_FL,  F_F16 | F_CF | F_ES32 | F_EXTF16},
+
+    // On ES3.0+, 16 bit floats are texture-filterable.
+    // Don't bother with 32 bit floats; they exist but are neither CR nor TF.
+    {GL_R16F,                GL_RED,             T_FL,  F_F16 | F_TF | F_ES3},
+    {GL_RG16F,               GL_RG,              T_FL,  F_F16 | F_TF | F_ES3},
+    {GL_RGB16F,              GL_RGB,             T_FL,  F_F16 | F_TF | F_ES3},
+    {GL_RGBA16F,             GL_RGBA,            T_FL,  F_F16 | F_TF | F_ES3},
+
+    // These might be useful as FBO formats.
+    {GL_RGB10_A2,            GL_RGBA,
+     GL_UNSIGNED_INT_2_10_10_10_REV,                    F_CF | F_GL3 | F_ES3},
+    {GL_RGBA12,              GL_RGBA,            T_U16, F_CF | F_GL2 | F_GL3},
+    {GL_RGB10,               GL_RGB,             T_U16, F_CF | F_GL2 | F_GL3},
+
+    // Special formats.
+    {GL_RGB8,                GL_RGB,
+     GL_UNSIGNED_SHORT_5_6_5,                           F_TF | F_GL2 | F_GL3},
+    {GL_RGB_RAW_422_APPLE,   GL_RGB_422_APPLE,
+     GL_UNSIGNED_SHORT_8_8_APPLE,                       F_TF | F_APPL},
+    {GL_RGB_RAW_422_APPLE,   GL_RGB_422_APPLE,
+     GL_UNSIGNED_SHORT_8_8_REV_APPLE,                   F_TF | F_APPL},
+
+    {0}
+};
+
+// Pairs of mpv formats and OpenGL types that match directly. Code using this
+// is supposed to look through the gl_formats table, and there is supposed to
+// be exactly 1 matching entry (which tells you format/internal format).
+static const int special_formats[][2] = {
+    {IMGFMT_RGB565,     GL_UNSIGNED_SHORT_5_6_5},
+    {IMGFMT_UYVY,       GL_UNSIGNED_SHORT_8_8_APPLE},
+    {IMGFMT_YUYV,       GL_UNSIGNED_SHORT_8_8_REV_APPLE},
+    {0}
+};
+
+// Return an or-ed combination of all F_ flags that apply.
+int gl_format_feature_flags(GL *gl)
+{
+    return (gl->version == 210 ? F_GL2 : 0)
+         | (gl->version >= 300 ? F_GL3 : 0)
+         | (gl->es == 200 ? F_ES2 : 0)
+         | (gl->es >= 300 ? F_ES3 : 0)
+         | (gl->es >= 320 ? F_ES32 : 0)
+         | (gl->mpgl_caps & MPGL_CAP_EXT16 ? F_EXT16 : 0)
+         | ((gl->es >= 300 &&
+            (gl->mpgl_caps & MPGL_CAP_EXT_CR_HFLOAT)) ? F_EXTF16 : 0)
+         | ((gl->version == 210 &&
+            (gl->mpgl_caps & MPGL_CAP_ARB_FLOAT) &&
+            (gl->mpgl_caps & MPGL_CAP_TEX_RG) &&
+            (gl->mpgl_caps & MPGL_CAP_FB)) ? F_GL2F : 0)
+         | (gl->mpgl_caps & MPGL_CAP_APPLE_RGB_422 ? F_APPL : 0);
+}
+
+// Return the entry for the given internal format. Return NULL if unsupported.
+const struct gl_format *gl_find_internal_format(GL *gl, GLint internal_format)
+{
+    int features = gl_format_feature_flags(gl);
+    for (int n = 0; gl_formats[n].type; n++) {
+        const struct gl_format *f = &gl_formats[n];
+        if (f->internal_format == internal_format && (f->flags & features))
+            return f;
+    }
+    return NULL;
+}
+
+const struct gl_format *gl_find_special_format(GL *gl, int mpfmt)
+{
+    int features = gl_format_feature_flags(gl);
+    for (int n = 0; special_formats[n][0]; n++) {
+        if (special_formats[n][0] == mpfmt) {
+            GLenum type = special_formats[n][1];
+            for (int i = 0; gl_formats[i].type; i++) {
+                const struct gl_format *f = &gl_formats[i];
+                if (f->type == type && (f->flags & features))
+                    return f;
+            }
+            break;
+        }
+    }
+    return NULL;
+}
+
+// type: one of MPGL_TYPE_*
+// flags: bitset of F_*, all flags must be present
+const struct gl_format *gl_find_format(GL *gl, int type, int flags,
+                                       int bytes_per_component, int n_components)
+{
+    if (!bytes_per_component || !n_components || !type)
+        return NULL;
+    int features = gl_format_feature_flags(gl);
+    for (int n = 0; gl_formats[n].type; n++) {
+        const struct gl_format *f = &gl_formats[n];
+        if ((f->flags & features) &&
+            ((f->flags & flags) == flags) &&
+            gl_format_type(f) == type &&
+            gl_component_size(f->type) == bytes_per_component &&
+            gl_format_components(f->format) == n_components)
+            return f;
+    }
+    return NULL;
+}
+
+// Return a texture-filterable unsigned normalized fixed point format.
+const struct gl_format *gl_find_unorm_format(GL *gl, int bytes_per_component,
+                                             int n_components)
+{
+    return gl_find_format(gl, MPGL_TYPE_UNORM, F_TF, bytes_per_component,
+                          n_components);
+}
+
+// Return an unsigned integer format.
+const struct gl_format *gl_find_uint_format(GL *gl, int bytes_per_component,
+                                            int n_components)
+{
+    return gl_find_format(gl, MPGL_TYPE_UINT, 0, bytes_per_component,
+                          n_components);
+}
+
+// Return a 16 bit float format. Note that this will return a GL_FLOAT format
+// with 32 bits per component; just the internal representation is smaller.
+// Some GL versions will allow upload with GL_HALF_FLOAT as well.
+const struct gl_format *gl_find_float16_format(GL *gl, int n_components)
+{
+    return gl_find_format(gl, MPGL_TYPE_FLOAT, F_F16, 4, n_components);
+}
+
+int gl_format_type(const struct gl_format *format)
+{
+    if (!format)
+        return 0;
+    if (format->type == GL_FLOAT)
+        return MPGL_TYPE_FLOAT;
+    if (gl_integer_format_to_base(format->format))
+        return MPGL_TYPE_UINT;
+    return MPGL_TYPE_UNORM;
+}
+
+// Map an integer pixel "format" to its base internal format.
+// Return 0 if it's not an integer format.
+GLenum gl_integer_format_to_base(GLenum format)
+{
+    switch (format) {
+    case GL_RED_INTEGER:        return GL_RED;
+    case GL_RG_INTEGER:         return GL_RG;
+    case GL_RGB_INTEGER:        return GL_RGB;
+    case GL_RGBA_INTEGER:       return GL_RGBA;
+    }
+    return 0;
+}
+
+// Return the number of bytes per component this format implies.
+// Returns 0 for formats with non-byte alignments and formats which
+// merge multiple components (like GL_UNSIGNED_SHORT_5_6_5).
+int gl_component_size(GLenum type)
+{
+    switch (type) {
+    case GL_UNSIGNED_BYTE:                      return 1;
+    case GL_UNSIGNED_SHORT:                     return 2;
+    case GL_FLOAT:                              return 4;
+    }
+    return 0;
+}
+
+// Return the number of components of a pixel "format".
+int gl_format_components(GLenum format)
+{
+    switch (format) {
+    case GL_RED:
+    case GL_RED_INTEGER:
+    case GL_LUMINANCE:
+        return 1;
+    case GL_RG:
+    case GL_RG_INTEGER:
+    case GL_LUMINANCE_ALPHA:
+        return 2;
+    case GL_RGB:
+    case GL_RGB_INTEGER:
+        return 3;
+    case GL_RGBA:
+    case GL_RGBA_INTEGER:
+        return 4;
+    }
+    return 0;
+}
+
+// return the number of bytes per pixel for the given format
+// does not handle all possible variants, just those used by mpv
+int gl_bytes_per_pixel(GLenum format, GLenum type)
+{
+    // Formats with merged components are special.
+    switch (type) {
+    case GL_UNSIGNED_INT_2_10_10_10_REV:        return 4;
+    case GL_UNSIGNED_SHORT_5_6_5:               return 2;
+    case GL_UNSIGNED_SHORT_8_8_APPLE:           return 2;
+    case GL_UNSIGNED_SHORT_8_8_REV_APPLE:       return 2;
+    }
+
+    return gl_format_components(format) * gl_component_size(type);
+}
diff --git a/video/out/opengl/formats.h b/video/out/opengl/formats.h
new file mode 100644
index 0000000..6ced4a7
--- /dev/null
+++ b/video/out/opengl/formats.h
@@ -0,0 +1,59 @@
+#ifndef MPGL_FORMATS_H_
+#define MPGL_FORMATS_H_
+
+#include "common.h"
+
+struct gl_format {
+    GLint internal_format;      // glTexImage argument
+    GLenum format;              // glTexImage argument
+    GLenum type;                // e.g. GL_UNSIGNED_SHORT
+    int flags;
+};
+
+extern const struct gl_format gl_formats[];
+
+enum {
+    // --- gl_format.flags
+
+    // Version flags. If at least 1 flag matches, the format entry is considered
+    // supported on the current GL context.
+    F_GL2       = 1 << 0, // GL2.1-only
+    F_GL3       = 1 << 1, // GL3.0 or later
+    F_ES2       = 1 << 2, // ES2-only
+    F_ES3       = 1 << 3, // ES3.0 or later
+    F_ES32      = 1 << 4, // ES3.2 or later
+    F_EXT16     = 1 << 5, // ES with GL_EXT_texture_norm16
+    F_EXTF16    = 1 << 6, // GL_EXT_color_buffer_half_float
+    F_GL2F      = 1 << 7, // GL2.1-only with texture_rg + texture_float + FBOs
+    F_APPL      = 1 << 8, // GL_APPLE_rgb_422
+
+    // Feature flags. They are additional and signal presence of features.
+    F_CR        = 1 << 16, // color-renderable
+    F_TF        = 1 << 17, // texture-filterable with GL_LINEAR
+    F_CF        = F_CR | F_TF,
+    F_F16       = 1 << 18, // uses half-floats (16 bit) internally, even though
+                           // the format is still GL_FLOAT (32 bit)
+
+    // --- Other constants.
+    MPGL_TYPE_UNORM = 1,
+    MPGL_TYPE_UINT = 2,
+    MPGL_TYPE_FLOAT = 3,
+};
+
+int gl_format_feature_flags(GL *gl);
+const struct gl_format *gl_find_internal_format(GL *gl, GLint internal_format);
+const struct gl_format *gl_find_special_format(GL *gl, int mpfmt);
+const struct gl_format *gl_find_format(GL *gl, int type, int flags,
+                                       int bytes_per_component, int n_components);
+const struct gl_format *gl_find_unorm_format(GL *gl, int bytes_per_component,
+                                             int n_components);
+const struct gl_format *gl_find_uint_format(GL *gl, int bytes_per_component,
+                                            int n_components);
+const struct gl_format *gl_find_float16_format(GL *gl, int n_components);
+int gl_format_type(const struct gl_format *format);
+GLenum gl_integer_format_to_base(GLenum format);
+int gl_component_size(GLenum type);
+int gl_format_components(GLenum format);
+int gl_bytes_per_pixel(GLenum format, GLenum type);
+
+#endif
diff --git a/video/out/opengl/header_fixes.h b/video/out/opengl/header_fixes.h
index 885c277..9953f7e 100644
--- a/video/out/opengl/header_fixes.h
+++ b/video/out/opengl/header_fixes.h
@@ -62,6 +62,10 @@
 #define GL_DEBUG_SEVERITY_NOTIFICATION    0x826B
 #endif
 
+#ifndef GL_BACK_LEFT
+#define GL_BACK_LEFT                      0x0402
+#endif
+
 #if HAVE_ANDROID_GL
 #define GL_UNSIGNED_BYTE_3_3_2            0x8032
 #define GL_UNSIGNED_BYTE_2_3_3_REV        0x8362
@@ -80,12 +84,34 @@
 #define GL_TEXTURE_LUMINANCE_SIZE 0x8060
 #define GL_R16 0x822A
 #define GL_RG16 0x822C
+#define GL_LUMINANCE8 0x8040
+#define GL_LUMINANCE8_ALPHA8 0x8045
 #define GL_LUMINANCE16 0x8042
 #define GL_LUMINANCE16_ALPHA16 0x8048
 #define GL_UNSIGNED_SHORT_8_8_APPLE 0x85BA
 #define GL_UNSIGNED_SHORT_8_8_REV_APPLE 0x85BB
 #endif
 
+// GL_ARB_timer_query and EXT_disjoint_timer_query
+#ifndef GL_TIME_ELAPSED
+// Same as GL_TIME_ELAPSED_EXT
+#define GL_TIME_ELAPSED 0x88BF
+#endif
+
+// GL_OES_EGL_image_external, GL_NV_EGL_stream_consumer_external
+#ifndef GL_TEXTURE_EXTERNAL_OES
+#define GL_TEXTURE_EXTERNAL_OES 0x8D65
+#endif
+
+// GL_ANGLE_translated_shader_source
+#ifndef GL_TRANSLATED_SHADER_SOURCE_LENGTH_ANGLE
+#define GL_TRANSLATED_SHADER_SOURCE_LENGTH_ANGLE 0x93A0
+#endif
+
+#ifndef GL_RGB_RAW_422_APPLE
+#define GL_RGB_RAW_422_APPLE 0x8A51
+#endif
+
 #undef MP_GET_GL_WORKAROUNDS
 
 #endif // MP_GET_GL_WORKAROUNDS
diff --git a/video/out/opengl/hwdec.c b/video/out/opengl/hwdec.c
index b58af9b..8c82861 100644
--- a/video/out/opengl/hwdec.c
+++ b/video/out/opengl/hwdec.c
@@ -29,6 +29,8 @@ extern const struct gl_hwdec_driver gl_hwdec_vaglx;
 extern const struct gl_hwdec_driver gl_hwdec_videotoolbox;
 extern const struct gl_hwdec_driver gl_hwdec_vdpau;
 extern const struct gl_hwdec_driver gl_hwdec_dxva2egl;
+extern const struct gl_hwdec_driver gl_hwdec_d3d11egl;
+extern const struct gl_hwdec_driver gl_hwdec_d3d11eglrgb;
 extern const struct gl_hwdec_driver gl_hwdec_dxva2gldx;
 extern const struct gl_hwdec_driver gl_hwdec_dxva2;
 
@@ -45,8 +47,10 @@ static const struct gl_hwdec_driver *const mpgl_hwdec_drivers[] = {
 #if HAVE_VIDEOTOOLBOX_GL
     &gl_hwdec_videotoolbox,
 #endif
-#if HAVE_DXVA2_HWACCEL
+#if HAVE_D3D_HWACCEL
 #if HAVE_EGL_ANGLE
+    &gl_hwdec_d3d11egl,
+    &gl_hwdec_d3d11eglrgb,
     &gl_hwdec_dxva2egl,
 #endif
 #if HAVE_GL_DXINTEROP
@@ -59,6 +63,7 @@ static const struct gl_hwdec_driver *const mpgl_hwdec_drivers[] = {
 
 static struct gl_hwdec *load_hwdec_driver(struct mp_log *log, GL *gl,
                                           struct mpv_global *global,
+                                          struct mp_hwdec_devices *devs,
                                           const struct gl_hwdec_driver *drv,
                                           bool is_auto)
 {
@@ -68,7 +73,7 @@ static struct gl_hwdec *load_hwdec_driver(struct mp_log *log, GL *gl,
         .log = mp_log_new(hwdec, log, drv->name),
         .global = global,
         .gl = gl,
-        .gl_texture_target = GL_TEXTURE_2D,
+        .devs = devs,
         .probing = is_auto,
     };
     mp_verbose(log, "Loading hwdec driver '%s'\n", drv->name);
@@ -80,14 +85,16 @@ static struct gl_hwdec *load_hwdec_driver(struct mp_log *log, GL *gl,
     return hwdec;
 }
 
-struct gl_hwdec *gl_hwdec_load_api_id(struct mp_log *log, GL *gl,
-                                      struct mpv_global *g, int id)
+struct gl_hwdec *gl_hwdec_load_api(struct mp_log *log, GL *gl,
+                                   struct mpv_global *g,
+                                   struct mp_hwdec_devices *devs,
+                                   enum hwdec_type api)
 {
-    bool is_auto = id == HWDEC_AUTO;
+    bool is_auto = HWDEC_IS_AUTO(api);
     for (int n = 0; mpgl_hwdec_drivers[n]; n++) {
         const struct gl_hwdec_driver *drv = mpgl_hwdec_drivers[n];
-        if (is_auto || id == drv->api) {
-            struct gl_hwdec *r = load_hwdec_driver(log, gl, g, drv, is_auto);
+        if (is_auto || api == drv->api) {
+            struct gl_hwdec *r = load_hwdec_driver(log, gl, g, devs, drv, is_auto);
             if (r)
                 return r;
         }
@@ -95,19 +102,6 @@ struct gl_hwdec *gl_hwdec_load_api_id(struct mp_log *log, GL *gl,
     return NULL;
 }
 
-// Like gl_hwdec_load_api_id(), but use option names.
-struct gl_hwdec *gl_hwdec_load_api(struct mp_log *log, GL *gl,
-                                   struct mpv_global *g, const char *api_name)
-{
-    int id = HWDEC_NONE;
-    for (const struct m_opt_choice_alternatives *c = mp_hwdec_names; c->name; c++)
-    {
-        if (strcmp(c->name, api_name) == 0)
-            id = c->value;
-    }
-    return gl_hwdec_load_api_id(log, gl, g, id);
-}
-
 void gl_hwdec_uninit(struct gl_hwdec *hwdec)
 {
     if (hwdec)
diff --git a/video/out/opengl/hwdec.h b/video/out/opengl/hwdec.h
index 5126d7f..29ccd18 100644
--- a/video/out/opengl/hwdec.h
+++ b/video/out/opengl/hwdec.h
@@ -4,54 +4,61 @@
 #include "common.h"
 #include "video/hwdec.h"
 
-struct mp_hwdec_info;
-
 struct gl_hwdec {
     const struct gl_hwdec_driver *driver;
     struct mp_log *log;
     struct mpv_global *global;
     GL *gl;
-    struct mp_hwdec_ctx *hwctx;
+    struct mp_hwdec_devices *devs;
+    // GLSL extensions required to sample textures from this.
+    const char **glsl_extensions;
     // For free use by hwdec driver
     void *priv;
     // For working around the vdpau vs. vaapi mess.
     bool probing;
-    // hwdec backends must set this to an IMGFMT_ that has an equivalent
-    // internal representation in gl_video.c as the hardware texture.
-    // It's used to build the rendering chain. For example, setting it to
-    // IMGFMT_RGB0 indicates that the video texture is RGB.
-    int converted_imgfmt;
-    // Normally this is GL_TEXTURE_2D, but the hwdec driver can set it to
-    // GL_TEXTURE_RECTANGLE. This is needed because VideoToolbox is shit.
-    GLenum gl_texture_target;
+};
+
+struct gl_hwdec_plane {
+    GLuint gl_texture;
+    GLenum gl_target;
+    int tex_w, tex_h; // allocated texture size
+    char swizzle[5]; // component order (if length is 0, use defaults)
+};
+
+struct gl_hwdec_frame {
+    struct gl_hwdec_plane planes[4];
+    bool vdpau_fields;
 };
 
 struct gl_hwdec_driver {
-    // Name of the interop backend. This is used for logging only.
+    // Name of the interop backend. This is used for informational purposes only.
     const char *name;
     // Used to explicitly request a specific API.
     enum hwdec_type api;
     // The hardware surface IMGFMT_ that must be passed to map_image later.
     int imgfmt;
-    // Create the hwdec device. It must fill in hw->info, if applicable.
-    // This also must set hw->converted_imgfmt.
+    // Create the hwdec device. It must add it to hw->devs, if applicable.
     int (*create)(struct gl_hwdec *hw);
     // Prepare for rendering video. (E.g. create textures.)
     // Called on initialization, and every time the video size changes.
     // *params must be set to the format the hw textures return.
-    // This also can update hw->converted_imgfmt.
     int (*reinit)(struct gl_hwdec *hw, struct mp_image_params *params);
     // Return textures that contain a copy or reference of the given hw_image.
-    int (*map_image)(struct gl_hwdec *hw, struct mp_image *hw_image,
-                     GLuint *out_textures);
+    // The textures mirror the format returned by the reinit params argument.
+    // The textures must remain valid until unmap is called.
+    // hw_image remains referenced by the caller until unmap is called.
+    int (*map_frame)(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame);
+    // Must be idempotent.
+    void (*unmap)(struct gl_hwdec *hw);
 
     void (*destroy)(struct gl_hwdec *hw);
 };
 
 struct gl_hwdec *gl_hwdec_load_api(struct mp_log *log, GL *gl,
-                                   struct mpv_global *g, const char *api_name);
-struct gl_hwdec *gl_hwdec_load_api_id(struct mp_log *log, GL *gl,
-                                      struct mpv_global *g, int id);
+                                   struct mpv_global *g,
+                                   struct mp_hwdec_devices *devs,
+                                   enum hwdec_type api);
 
 void gl_hwdec_uninit(struct gl_hwdec *hwdec);
 
diff --git a/video/out/opengl/hwdec_d3d11egl.c b/video/out/opengl/hwdec_d3d11egl.c
new file mode 100644
index 0000000..549d3f5
--- /dev/null
+++ b/video/out/opengl/hwdec_d3d11egl.c
@@ -0,0 +1,335 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <initguid.h>
+#include <assert.h>
+#include <windows.h>
+#include <d3d11.h>
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+
+#include "angle_common.h"
+#include "angle_dynamic.h"
+
+#include "common/common.h"
+#include "osdep/timer.h"
+#include "osdep/windows_utils.h"
+#include "hwdec.h"
+#include "video/hwdec.h"
+
+#ifndef EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE
+#define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x3AAB
+#endif
+
+struct priv {
+    struct mp_hwdec_ctx hwctx;
+
+    ID3D11Device *d3d11_device;
+    EGLDisplay egl_display;
+
+    EGLStreamKHR egl_stream;
+    GLuint gl_textures[3];
+
+    // EGL_KHR_stream
+    EGLStreamKHR (EGLAPIENTRY *CreateStreamKHR)(EGLDisplay dpy,
+                                                const EGLint *attrib_list);
+    EGLBoolean (EGLAPIENTRY *DestroyStreamKHR)(EGLDisplay dpy,
+                                               EGLStreamKHR stream);
+
+    // EGL_KHR_stream_consumer_gltexture
+    EGLBoolean (EGLAPIENTRY *StreamConsumerAcquireKHR)
+                                        (EGLDisplay dpy, EGLStreamKHR stream);
+    EGLBoolean (EGLAPIENTRY *StreamConsumerReleaseKHR)
+                                        (EGLDisplay dpy, EGLStreamKHR stream);
+
+    // EGL_NV_stream_consumer_gltexture_yuv
+    EGLBoolean (EGLAPIENTRY *StreamConsumerGLTextureExternalAttribsNV)
+                (EGLDisplay dpy, EGLStreamKHR stream, EGLAttrib *attrib_list);
+
+    // EGL_ANGLE_stream_producer_d3d_texture_nv12
+    EGLBoolean (EGLAPIENTRY *CreateStreamProducerD3DTextureNV12ANGLE)
+            (EGLDisplay dpy, EGLStreamKHR stream, const EGLAttrib *attrib_list);
+    EGLBoolean (EGLAPIENTRY *StreamPostD3DTextureNV12ANGLE)
+            (EGLDisplay dpy, EGLStreamKHR stream, void *texture,
+             const EGLAttrib *attrib_list);
+};
+
+static void destroy_objects(struct gl_hwdec *hw)
+{
+    struct priv *p = hw->priv;
+    GL *gl = hw->gl;
+
+    if (p->egl_stream)
+        p->DestroyStreamKHR(p->egl_display, p->egl_stream);
+    p->egl_stream = 0;
+
+    for (int n = 0; n < 3; n++) {
+        gl->DeleteTextures(1, &p->gl_textures[n]);
+        p->gl_textures[n] = 0;
+    }
+}
+
+static void destroy(struct gl_hwdec *hw)
+{
+    struct priv *p = hw->priv;
+
+    destroy_objects(hw);
+
+    hwdec_devices_remove(hw->devs, &p->hwctx);
+
+    if (p->d3d11_device)
+        ID3D11Device_Release(p->d3d11_device);
+    p->d3d11_device = NULL;
+}
+
+static int create(struct gl_hwdec *hw)
+{
+    if (!angle_load())
+        return -1;
+
+    EGLDisplay egl_display = eglGetCurrentDisplay();
+    if (!egl_display)
+        return -1;
+
+    if (!eglGetCurrentContext())
+        return -1;
+
+    const char *exts = eglQueryString(egl_display, EGL_EXTENSIONS);
+    if (!exts || !strstr(exts, "EGL_ANGLE_d3d_share_handle_client_buffer") ||
+        !strstr(exts, "EGL_ANGLE_stream_producer_d3d_texture_nv12") ||
+        !(strstr(hw->gl->extensions, "GL_OES_EGL_image_external_essl3") ||
+          hw->gl->es == 200) ||
+        !strstr(exts, "EGL_EXT_device_query") ||
+        !(hw->gl->mpgl_caps & MPGL_CAP_TEX_RG))
+        return -1;
+
+    HRESULT hr;
+    struct priv *p = talloc_zero(hw, struct priv);
+    hw->priv = p;
+
+    p->egl_display = egl_display;
+
+    p->CreateStreamKHR = (void *)eglGetProcAddress("eglCreateStreamKHR");
+    p->DestroyStreamKHR = (void *)eglGetProcAddress("eglDestroyStreamKHR");
+    p->StreamConsumerAcquireKHR =
+        (void *)eglGetProcAddress("eglStreamConsumerAcquireKHR");
+    p->StreamConsumerReleaseKHR =
+        (void *)eglGetProcAddress("eglStreamConsumerReleaseKHR");
+    p->StreamConsumerGLTextureExternalAttribsNV =
+        (void *)eglGetProcAddress("eglStreamConsumerGLTextureExternalAttribsNV");
+    p->CreateStreamProducerD3DTextureNV12ANGLE =
+        (void *)eglGetProcAddress("eglCreateStreamProducerD3DTextureNV12ANGLE");
+    p->StreamPostD3DTextureNV12ANGLE =
+        (void *)eglGetProcAddress("eglStreamPostD3DTextureNV12ANGLE");
+
+    if (!p->CreateStreamKHR || !p->DestroyStreamKHR ||
+        !p->StreamConsumerAcquireKHR || !p->StreamConsumerReleaseKHR ||
+        !p->StreamConsumerGLTextureExternalAttribsNV ||
+        !p->CreateStreamProducerD3DTextureNV12ANGLE ||
+        !p->StreamPostD3DTextureNV12ANGLE)
+    {
+        MP_ERR(hw, "Failed to load some EGLStream functions.\n");
+        goto fail;
+    }
+
+    static const char *es2_exts[] = {"GL_NV_EGL_stream_consumer_external", 0};
+    static const char *es3_exts[] = {"GL_NV_EGL_stream_consumer_external",
+                                     "GL_OES_EGL_image_external_essl3", 0};
+    hw->glsl_extensions = hw->gl->es == 200 ? es2_exts : es3_exts;
+
+    PFNEGLQUERYDISPLAYATTRIBEXTPROC p_eglQueryDisplayAttribEXT =
+        (void *)eglGetProcAddress("eglQueryDisplayAttribEXT");
+    PFNEGLQUERYDEVICEATTRIBEXTPROC p_eglQueryDeviceAttribEXT =
+        (void *)eglGetProcAddress("eglQueryDeviceAttribEXT");
+    if (!p_eglQueryDisplayAttribEXT || !p_eglQueryDeviceAttribEXT)
+        goto fail;
+
+    EGLAttrib device = 0;
+    if (!p_eglQueryDisplayAttribEXT(egl_display, EGL_DEVICE_EXT, &device))
+        goto fail;
+    EGLAttrib d3d_device = 0;
+    if (!p_eglQueryDeviceAttribEXT((EGLDeviceEXT)device,
+                                    EGL_D3D11_DEVICE_ANGLE, &d3d_device))
+    {
+        MP_ERR(hw, "Could not get EGL_D3D11_DEVICE_ANGLE from ANGLE.\n");
+        goto fail;
+    }
+
+    p->d3d11_device = (ID3D11Device *)d3d_device;
+    if (!p->d3d11_device)
+        goto fail;
+    ID3D11Device_AddRef(p->d3d11_device);
+
+    if (!d3d11_check_decoding(p->d3d11_device)) {
+        MP_VERBOSE(hw, "D3D11 video decoding not supported on this system.\n");
+        goto fail;
+    }
+
+    ID3D10Multithread *multithread;
+    hr = ID3D11Device_QueryInterface(p->d3d11_device, &IID_ID3D10Multithread,
+                                     (void **)&multithread);
+    if (FAILED(hr)) {
+        MP_ERR(hw, "Failed to get Multithread interface: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto fail;
+    }
+    ID3D10Multithread_SetMultithreadProtected(multithread, TRUE);
+    ID3D10Multithread_Release(multithread);
+
+    p->hwctx = (struct mp_hwdec_ctx){
+        .type = HWDEC_D3D11VA,
+        .driver_name = hw->driver->name,
+        .ctx = p->d3d11_device,
+    };
+    hwdec_devices_add(hw->devs, &p->hwctx);
+
+    return 0;
+fail:
+    destroy(hw);
+    return -1;
+}
+
+static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
+{
+    struct priv *p = hw->priv;
+    GL *gl = hw->gl;
+
+    destroy_objects(hw);
+
+    if (params->hw_subfmt != IMGFMT_NV12) {
+        MP_FATAL(hw, "Format not supported.\n");
+        return -1;
+    }
+
+    // Hope that the given texture unit range is not "in use" by anything.
+    // The texture units need to be bound during init only, and are free for
+    // use again after the initialization here is done.
+    int texunits = 0; // [texunits, texunits + num_planes)
+    int num_planes = 2;
+    int gl_target = GL_TEXTURE_EXTERNAL_OES;
+
+    p->egl_stream = p->CreateStreamKHR(p->egl_display, (EGLint[]){EGL_NONE});
+    if (!p->egl_stream)
+        goto fail;
+
+    for (int n = 0; n < num_planes; n++) {
+        gl->ActiveTexture(GL_TEXTURE0 + texunits + n);
+        gl->GenTextures(1, &p->gl_textures[n]);
+        gl->BindTexture(gl_target, p->gl_textures[n]);
+        gl->TexParameteri(gl_target, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+        gl->TexParameteri(gl_target, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+        gl->TexParameteri(gl_target, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+        gl->TexParameteri(gl_target, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    }
+
+    EGLAttrib attrs[] = {
+        EGL_COLOR_BUFFER_TYPE,          EGL_YUV_BUFFER_EXT,
+        EGL_YUV_NUMBER_OF_PLANES_EXT,   num_planes,
+        EGL_YUV_PLANE0_TEXTURE_UNIT_NV, texunits + 0,
+        EGL_YUV_PLANE1_TEXTURE_UNIT_NV, texunits + 1,
+        EGL_NONE,
+    };
+
+    if (!p->StreamConsumerGLTextureExternalAttribsNV(p->egl_display, p->egl_stream,
+                                                     attrs))
+        goto fail;
+
+    if (!p->CreateStreamProducerD3DTextureNV12ANGLE(p->egl_display, p->egl_stream,
+                                                    (EGLAttrib[]){EGL_NONE}))
+        goto fail;
+
+    params->imgfmt = params->hw_subfmt;
+
+    for (int n = 0; n < num_planes; n++) {
+        gl->ActiveTexture(GL_TEXTURE0 + texunits + n);
+        gl->BindTexture(gl_target, 0);
+    }
+    gl->ActiveTexture(GL_TEXTURE0);
+    return 0;
+fail:
+    MP_ERR(hw, "Failed to create EGLStream\n");
+    if (p->egl_stream)
+        p->DestroyStreamKHR(p->egl_display, p->egl_stream);
+    p->egl_stream = 0;
+    gl->ActiveTexture(GL_TEXTURE0);
+    return -1;
+}
+
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
+{
+    struct priv *p = hw->priv;
+
+    if (!p->gl_textures[0])
+        return -1;
+
+    ID3D11Texture2D *d3d_tex = (void *)hw_image->planes[1];
+    int d3d_subindex = (intptr_t)hw_image->planes[2];
+    if (!d3d_tex)
+        return -1;
+
+    EGLAttrib attrs[] = {
+        EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE, d3d_subindex,
+        EGL_NONE,
+    };
+    if (!p->StreamPostD3DTextureNV12ANGLE(p->egl_display, p->egl_stream,
+                                          (void *)d3d_tex, attrs))
+        return -1;
+
+    if (!p->StreamConsumerAcquireKHR(p->egl_display, p->egl_stream))
+        return -1;
+
+    D3D11_TEXTURE2D_DESC texdesc;
+    ID3D11Texture2D_GetDesc(d3d_tex, &texdesc);
+
+    *out_frame = (struct gl_hwdec_frame){
+        .planes = {
+            {
+                .gl_texture = p->gl_textures[0],
+                .gl_target = GL_TEXTURE_EXTERNAL_OES,
+                .tex_w = texdesc.Width,
+                .tex_h = texdesc.Height,
+            },
+            {
+                .gl_texture = p->gl_textures[1],
+                .gl_target = GL_TEXTURE_EXTERNAL_OES,
+                .tex_w = texdesc.Width / 2,
+                .tex_h = texdesc.Height / 2,
+            },
+        },
+    };
+    return 0;
+}
+
+static void unmap(struct gl_hwdec *hw)
+{
+    struct priv *p = hw->priv;
+    if (p->egl_stream)
+        p->StreamConsumerReleaseKHR(p->egl_display, p->egl_stream);
+}
+
+const struct gl_hwdec_driver gl_hwdec_d3d11egl = {
+    .name = "d3d11-egl",
+    .api = HWDEC_D3D11VA,
+    .imgfmt = IMGFMT_D3D11NV12,
+    .create = create,
+    .reinit = reinit,
+    .map_frame = map_frame,
+    .unmap = unmap,
+    .destroy = destroy,
+};
diff --git a/video/out/opengl/hwdec_d3d11eglrgb.c b/video/out/opengl/hwdec_d3d11eglrgb.c
new file mode 100644
index 0000000..2e61189
--- /dev/null
+++ b/video/out/opengl/hwdec_d3d11eglrgb.c
@@ -0,0 +1,268 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <initguid.h>
+#include <assert.h>
+#include <windows.h>
+#include <d3d11.h>
+
+#include <EGL/egl.h>
+#include <EGL/eglext.h>
+
+#include "angle_common.h"
+#include "angle_dynamic.h"
+
+#include "common/common.h"
+#include "osdep/timer.h"
+#include "osdep/windows_utils.h"
+#include "hwdec.h"
+#include "video/hwdec.h"
+
+#ifndef EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE
+#define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x3AAB
+#endif
+
+struct priv {
+    struct mp_hwdec_ctx hwctx;
+
+    ID3D11Device *d3d11_device;
+
+    EGLDisplay egl_display;
+    EGLConfig  egl_config;
+    EGLSurface egl_surface;
+
+    GLuint gl_texture;
+};
+
+static void unmap(struct gl_hwdec *hw)
+{
+    struct priv *p = hw->priv;
+    if (p->egl_surface) {
+        eglReleaseTexImage(p->egl_display, p->egl_surface, EGL_BACK_BUFFER);
+        eglDestroySurface(p->egl_display, p->egl_surface);
+    }
+    p->egl_surface = NULL;
+}
+
+static void destroy_objects(struct gl_hwdec *hw)
+{
+    struct priv *p = hw->priv;
+    GL *gl = hw->gl;
+
+    unmap(hw);
+
+    gl->DeleteTextures(1, &p->gl_texture);
+    p->gl_texture = 0;
+}
+
+static void destroy(struct gl_hwdec *hw)
+{
+    struct priv *p = hw->priv;
+
+    destroy_objects(hw);
+
+    hwdec_devices_remove(hw->devs, &p->hwctx);
+
+    if (p->d3d11_device)
+        ID3D11Device_Release(p->d3d11_device);
+    p->d3d11_device = NULL;
+}
+
+static int create(struct gl_hwdec *hw)
+{
+    if (!angle_load())
+        return -1;
+
+    EGLDisplay egl_display = eglGetCurrentDisplay();
+    if (!egl_display)
+        return -1;
+
+    if (!eglGetCurrentContext())
+        return -1;
+
+    const char *exts = eglQueryString(egl_display, EGL_EXTENSIONS);
+    if (!exts || !strstr(exts, "EGL_ANGLE_d3d_share_handle_client_buffer"))
+        return -1;
+
+    HRESULT hr;
+    struct priv *p = talloc_zero(hw, struct priv);
+    hw->priv = p;
+
+    p->egl_display = egl_display;
+
+    HANDLE d3d11_dll = GetModuleHandleW(L"d3d11.dll");
+    if (!d3d11_dll) {
+        if (!hw->probing)
+            MP_ERR(hw, "Failed to load D3D11 library\n");
+        goto fail;
+    }
+
+    PFN_D3D11_CREATE_DEVICE CreateDevice =
+        (void *)GetProcAddress(d3d11_dll, "D3D11CreateDevice");
+    if (!CreateDevice)
+        goto fail;
+
+    hr = CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL,
+                      D3D11_CREATE_DEVICE_VIDEO_SUPPORT, NULL, 0,
+                      D3D11_SDK_VERSION, &p->d3d11_device, NULL, NULL);
+    if (FAILED(hr)) {
+        int lev = hw->probing ? MSGL_V : MSGL_ERR;
+        mp_msg(hw->log, lev, "Failed to create D3D11 Device: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto fail;
+    }
+
+    ID3D10Multithread *multithread = NULL;
+    hr = ID3D11Device_QueryInterface(p->d3d11_device, &IID_ID3D10Multithread,
+                                     (void **)&multithread);
+    if (FAILED(hr)) {
+        // No interface is returned on failure, so there is nothing to release.
+        MP_ERR(hw, "Failed to get Multithread interface: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto fail;
+    }
+    ID3D10Multithread_SetMultithreadProtected(multithread, TRUE);
+    ID3D10Multithread_Release(multithread);
+
+    if (!d3d11_check_decoding(p->d3d11_device)) {
+        MP_VERBOSE(hw, "D3D11 video decoding not supported on this system.\n");
+        goto fail;
+    }
+
+    EGLint attrs[] = {
+        EGL_BUFFER_SIZE, 32,
+        EGL_RED_SIZE, 8,
+        EGL_GREEN_SIZE, 8,
+        EGL_BLUE_SIZE, 8,
+        EGL_SURFACE_TYPE, EGL_PBUFFER_BIT,
+        EGL_ALPHA_SIZE, 8,
+        EGL_BIND_TO_TEXTURE_RGBA, EGL_TRUE,
+        EGL_NONE
+    };
+    EGLint count;
+    if (!eglChooseConfig(p->egl_display, attrs, &p->egl_config, 1, &count) ||
+        !count) {
+        MP_ERR(hw, "Failed to get EGL surface configuration\n");
+        goto fail;
+    }
+
+    p->hwctx = (struct mp_hwdec_ctx){
+        .type = HWDEC_D3D11VA,
+        .driver_name = hw->driver->name,
+        .ctx = p->d3d11_device,
+    };
+    hwdec_devices_add(hw->devs, &p->hwctx);
+
+    return 0;
+fail:
+    destroy(hw);
+    return -1;
+}
+
+static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
+{
+    struct priv *p = hw->priv;
+    GL *gl = hw->gl;
+
+    destroy_objects(hw);
+
+    gl->GenTextures(1, &p->gl_texture);
+    gl->BindTexture(GL_TEXTURE_2D, p->gl_texture);
+    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    gl->BindTexture(GL_TEXTURE_2D, 0);
+
+    params->imgfmt = IMGFMT_RGB0;
+    return 0;
+}
+
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
+{
+    struct priv *p = hw->priv;
+    GL *gl = hw->gl;
+    HRESULT hr;
+
+    if (!p->gl_texture)
+        return -1;
+
+    ID3D11Texture2D *d3d_tex = (void *)hw_image->planes[1];
+    if (!d3d_tex)
+        return -1;
+
+    IDXGIResource *res;
+    hr = IUnknown_QueryInterface(d3d_tex, &IID_IDXGIResource, (void **)&res);
+    if (FAILED(hr))
+        return -1;
+
+    HANDLE share_handle = NULL;
+    hr = IDXGIResource_GetSharedHandle(res, &share_handle);
+    if (FAILED(hr))
+        share_handle = NULL;
+
+    IDXGIResource_Release(res);
+
+    if (!share_handle)
+        return -1;
+
+    D3D11_TEXTURE2D_DESC texdesc;
+    ID3D11Texture2D_GetDesc(d3d_tex, &texdesc);
+
+    EGLint attrib_list[] = {
+        EGL_WIDTH, texdesc.Width,
+        EGL_HEIGHT, texdesc.Height,
+        EGL_TEXTURE_FORMAT, EGL_TEXTURE_RGBA,
+        EGL_TEXTURE_TARGET, EGL_TEXTURE_2D,
+        EGL_NONE
+    };
+    p->egl_surface = eglCreatePbufferFromClientBuffer(
+        p->egl_display, EGL_D3D_TEXTURE_2D_SHARE_HANDLE_ANGLE,
+        share_handle, p->egl_config, attrib_list);
+    if (p->egl_surface == EGL_NO_SURFACE) {
+        MP_ERR(hw, "Failed to create EGL surface\n");
+        return -1;
+    }
+
+    gl->BindTexture(GL_TEXTURE_2D, p->gl_texture);
+    eglBindTexImage(p->egl_display, p->egl_surface, EGL_BACK_BUFFER);
+    gl->BindTexture(GL_TEXTURE_2D, 0);
+
+    *out_frame = (struct gl_hwdec_frame){
+        .planes = {
+            {
+                .gl_texture = p->gl_texture,
+                .gl_target = GL_TEXTURE_2D,
+                .tex_w = texdesc.Width,
+                .tex_h = texdesc.Height,
+            },
+        },
+    };
+    return 0;
+}
+
+const struct gl_hwdec_driver gl_hwdec_d3d11eglrgb = {
+    .name = "d3d11-egl-rgb",
+    .api = HWDEC_D3D11VA,
+    .imgfmt = IMGFMT_D3D11RGB,
+    .create = create,
+    .reinit = reinit,
+    .map_frame = map_frame,
+    .unmap = unmap,
+    .destroy = destroy,
+};
diff --git a/video/out/opengl/hwdec_dxva2.c b/video/out/opengl/hwdec_dxva2.c
index f72c817..d832bb4 100644
--- a/video/out/opengl/hwdec_dxva2.c
+++ b/video/out/opengl/hwdec_dxva2.c
@@ -1,8 +1,9 @@
+#include <d3d9.h>
+
 #include "common/common.h"
 
 #include "hwdec.h"
 #include "utils.h"
-#include "video/d3d.h"
 #include "video/hwdec.h"
 
 // This does not provide real (zero-copy) interop - it merely exists for
@@ -10,36 +11,38 @@
 // may help with OpenGL fullscreen mode.
 
 struct priv {
-    struct mp_d3d_ctx ctx;
+    struct mp_hwdec_ctx hwctx;
 };
 
 static void destroy(struct gl_hwdec *hw)
 {
     struct priv *p = hw->priv;
-    if (p->ctx.d3d9_device)
-        IDirect3DDevice9_Release(p->ctx.d3d9_device);
+    hwdec_devices_remove(hw->devs, &p->hwctx);
+    if (p->hwctx.ctx)
+        IDirect3DDevice9_Release((IDirect3DDevice9 *)p->hwctx.ctx);
 }
 
 static int create(struct gl_hwdec *hw)
 {
     GL *gl = hw->gl;
-    if (hw->hwctx || !gl->MPGetNativeDisplay)
+    if (!gl->MPGetNativeDisplay)
         return -1;
 
     struct priv *p = talloc_zero(hw, struct priv);
     hw->priv = p;
 
-    p->ctx.d3d9_device = gl->MPGetNativeDisplay("IDirect3DDevice9");
-    if (!p->ctx.d3d9_device)
+    IDirect3DDevice9 *d3d = gl->MPGetNativeDisplay("IDirect3DDevice9");
+    if (!d3d)
         return -1;
 
-    p->ctx.hwctx.type = HWDEC_DXVA2_COPY;
-    p->ctx.hwctx.d3d_ctx = &p->ctx;
-
-    MP_VERBOSE(hw, "Using libmpv supplied device %p.\n", p->ctx.d3d9_device);
+    MP_VERBOSE(hw, "Using libmpv supplied device %p.\n", d3d);
 
-    hw->hwctx = &p->ctx.hwctx;
-    hw->converted_imgfmt = 0;
+    p->hwctx = (struct mp_hwdec_ctx){
+        .type = HWDEC_DXVA2_COPY,
+        .driver_name = hw->driver->name,
+        .ctx = d3d,
+    };
+    hwdec_devices_add(hw->devs, &p->hwctx);
     return 0;
 }
 
@@ -48,8 +51,8 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
     return -1;
 }
 
-static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
-                     GLuint *out_textures)
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
 {
     return -1;
 }
@@ -60,6 +63,6 @@ const struct gl_hwdec_driver gl_hwdec_dxva2 = {
     .imgfmt = -1,
     .create = create,
     .reinit = reinit,
-    .map_image = map_image,
+    .map_frame = map_frame,
     .destroy = destroy,
 };
diff --git a/video/out/opengl/hwdec_dxva2egl.c b/video/out/opengl/hwdec_dxva2egl.c
index eed9426..d67a85b 100644
--- a/video/out/opengl/hwdec_dxva2egl.c
+++ b/video/out/opengl/hwdec_dxva2egl.c
@@ -17,19 +17,21 @@
 
 #include <assert.h>
 #include <windows.h>
+#include <d3d9.h>
+
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
+#include "angle_dynamic.h"
+
 #include "common/common.h"
 #include "osdep/timer.h"
 #include "osdep/windows_utils.h"
 #include "hwdec.h"
-#include "video/dxva2.h"
-#include "video/d3d.h"
 #include "video/hwdec.h"
 
 struct priv {
-    struct mp_d3d_ctx ctx;
+    struct mp_hwdec_ctx hwctx;
 
     HMODULE             d3d9_dll;
     IDirect3D9Ex       *d3d9ex;
@@ -77,6 +79,8 @@ static void destroy(struct gl_hwdec *hw)
 
     destroy_textures(hw);
 
+    hwdec_devices_remove(hw->devs, &p->hwctx);
+
     if (p->query9)
         IDirect3DQuery9_Release(p->query9);
 
@@ -92,13 +96,16 @@ static void destroy(struct gl_hwdec *hw)
 
 static int create(struct gl_hwdec *hw)
 {
-    if (hw->hwctx)
+    if (!angle_load())
         return -1;
 
     EGLDisplay egl_display = eglGetCurrentDisplay();
     if (!egl_display)
         return -1;
 
+    if (!eglGetCurrentContext())
+        return -1;
+
     const char *exts = eglQueryString(egl_display, EGL_EXTENSIONS);
     if (!exts ||
         !strstr(exts, "EGL_ANGLE_d3d_share_handle_client_buffer")) {
@@ -202,13 +209,13 @@ static int create(struct gl_hwdec *hw)
         goto fail;
     }
 
-    hw->converted_imgfmt = IMGFMT_RGB0;
-
-    p->ctx.d3d9_device = (IDirect3DDevice9 *)p->device9ex;
-    p->ctx.hwctx.type = HWDEC_DXVA2;
-    p->ctx.hwctx.d3d_ctx = &p->ctx;
+    p->hwctx = (struct mp_hwdec_ctx){
+        .type = HWDEC_DXVA2,
+        .driver_name = hw->driver->name,
+        .ctx = (IDirect3DDevice9 *)p->device9ex,
+    };
+    hwdec_devices_add(hw->devs, &p->hwctx);
 
-    hw->hwctx = &p->ctx.hwctx;
     return 0;
 fail:
     destroy(hw);
@@ -223,8 +230,6 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
 
     destroy_textures(hw);
 
-    assert(params->imgfmt == hw->driver->imgfmt);
-
     HANDLE share_handle = NULL;
     hr = IDirect3DDevice9Ex_CreateTexture(p->device9ex,
                                           params->w, params->h,
@@ -269,14 +274,15 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
     gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
     gl->BindTexture(GL_TEXTURE_2D, 0);
 
+    params->imgfmt = IMGFMT_RGB0;
     return 0;
 fail:
     destroy_textures(hw);
     return -1;
 }
 
-static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
-                     GLuint *out_textures)
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
 {
     struct priv *p = hw->priv;
     GL *gl = hw->gl;
@@ -285,7 +291,7 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
 
     HRESULT hr;
     RECT rc = {0, 0, hw_image->w, hw_image->h};
-    IDirect3DSurface9* hw_surface = d3d9_surface_in_mp_image(hw_image);
+    IDirect3DSurface9* hw_surface = (IDirect3DSurface9 *)hw_image->planes[3];
     hr = IDirect3DDevice9Ex_StretchRect(p->device9ex,
                                         hw_surface, &rc,
                                         p->surface9, &rc,
@@ -329,7 +335,16 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
     eglBindTexImage(p->egl_display, p->egl_surface, EGL_BACK_BUFFER);
     gl->BindTexture(GL_TEXTURE_2D, 0);
 
-    out_textures[0] = p->gl_texture;
+    *out_frame = (struct gl_hwdec_frame){
+        .planes = {
+            {
+                .gl_texture = p->gl_texture,
+                .gl_target = GL_TEXTURE_2D,
+                .tex_w = hw_image->w,
+                .tex_h = hw_image->h,
+            },
+        },
+    };
     return 0;
 }
 
@@ -339,6 +354,6 @@ const struct gl_hwdec_driver gl_hwdec_dxva2egl = {
     .imgfmt = IMGFMT_DXVA2,
     .create = create,
     .reinit = reinit,
-    .map_image = map_image,
+    .map_frame = map_frame,
     .destroy = destroy,
 };
diff --git a/video/out/opengl/hwdec_dxva2gldx.c b/video/out/opengl/hwdec_dxva2gldx.c
index 69be0cc..4cd8c1c 100644
--- a/video/out/opengl/hwdec_dxva2gldx.c
+++ b/video/out/opengl/hwdec_dxva2gldx.c
@@ -15,14 +15,13 @@
  * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <d3d9.h>
 #include <assert.h>
 
 #include "common/common.h"
 #include "osdep/windows_utils.h"
 #include "hwdec.h"
 #include "video/hwdec.h"
-#include "video/d3d.h"
-#include "video/dxva2.h"
 
 // for  WGL_ACCESS_READ_ONLY_NV
 #include <GL/wglext.h>
@@ -30,7 +29,7 @@
 #define SHARED_SURFACE_D3DFMT D3DFMT_X8R8G8B8
 #define SHARED_SURFACE_MPFMT  IMGFMT_RGB0
 struct priv {
-    struct mp_d3d_ctx ctx;
+    struct mp_hwdec_ctx hwctx;
     IDirect3DDevice9Ex *device;
     HANDLE device_h;
 
@@ -74,6 +73,8 @@ static void destroy(struct gl_hwdec *hw)
     struct priv *p = hw->priv;
     destroy_objects(hw);
 
+    hwdec_devices_remove(hw->devs, &p->hwctx);
+
     if (p->device)
         IDirect3DDevice9Ex_Release(p->device);
 }
@@ -81,10 +82,8 @@ static void destroy(struct gl_hwdec *hw)
 static int create(struct gl_hwdec *hw)
 {
     GL *gl = hw->gl;
-    if (hw->hwctx || !gl->MPGetNativeDisplay ||
-        !(gl->mpgl_caps & MPGL_CAP_DXINTEROP)) {
+    if (!gl->MPGetNativeDisplay || !(gl->mpgl_caps & MPGL_CAP_DXINTEROP))
         return -1;
-    }
 
     struct priv *p = talloc_zero(hw, struct priv);
     hw->priv = p;
@@ -100,13 +99,13 @@ static int create(struct gl_hwdec *hw)
     if (!p->device)
         return -1;
     IDirect3DDevice9Ex_AddRef(p->device);
-    p->ctx.d3d9_device = (IDirect3DDevice9 *)p->device;
-
-    p->ctx.hwctx.type = HWDEC_DXVA2;
-    p->ctx.hwctx.d3d_ctx = &p->ctx;
 
-    hw->hwctx = &p->ctx.hwctx;
-    hw->converted_imgfmt = SHARED_SURFACE_MPFMT;
+    p->hwctx = (struct mp_hwdec_ctx){
+        .type = HWDEC_DXVA2,
+        .driver_name = hw->driver->name,
+        .ctx = (IDirect3DDevice9 *)p->device,
+    };
+    hwdec_devices_add(hw->devs, &p->hwctx);
     return 0;
 }
 
@@ -118,8 +117,6 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
 
     destroy_objects(hw);
 
-    assert(params->imgfmt == hw->driver->imgfmt);
-
     HANDLE share_handle = NULL;
     hr = IDirect3DDevice9Ex_CreateRenderTarget(
         p->device,
@@ -162,14 +159,16 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
         goto fail;
     }
 
+    params->imgfmt = SHARED_SURFACE_MPFMT;
+
     return 0;
 fail:
     destroy_objects(hw);
     return -1;
 }
 
-static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
-                     GLuint *out_textures)
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
 {
     assert(hw_image && hw_image->imgfmt == hw->driver->imgfmt);
     GL *gl = hw->gl;
@@ -182,7 +181,7 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
         return -1;
     }
 
-    IDirect3DSurface9* hw_surface = d3d9_surface_in_mp_image(hw_image);
+    IDirect3DSurface9* hw_surface = (IDirect3DSurface9 *)hw_image->planes[3];
     RECT rc = {0, 0, hw_image->w, hw_image->h};
     hr = IDirect3DDevice9Ex_StretchRect(p->device,
                                         hw_surface, &rc,
@@ -199,7 +198,16 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
         return -1;
     }
 
-    out_textures[0] = p->texture;
+    *out_frame = (struct gl_hwdec_frame){
+        .planes = {
+            {
+                .gl_texture = p->texture,
+                .gl_target = GL_TEXTURE_2D,
+                .tex_w = hw_image->w,
+                .tex_h = hw_image->h,
+            },
+        },
+    };
     return 0;
 }
 
@@ -209,6 +217,6 @@ const struct gl_hwdec_driver gl_hwdec_dxva2gldx = {
     .imgfmt = IMGFMT_DXVA2,
     .create = create,
     .reinit = reinit,
-    .map_image = map_image,
+    .map_frame = map_frame,
     .destroy = destroy,
 };
diff --git a/video/out/opengl/hwdec_osx.c b/video/out/opengl/hwdec_osx.c
index addc16f..6ddfa66 100644
--- a/video/out/opengl/hwdec_osx.c
+++ b/video/out/opengl/hwdec_osx.c
@@ -33,6 +33,7 @@ struct vt_gl_plane_format {
     GLenum gl_format;
     GLenum gl_type;
     GLenum gl_internal_format;
+    char swizzle[5];
 };
 
 struct vt_format {
@@ -43,9 +44,11 @@ struct vt_format {
 };
 
 struct priv {
+    struct mp_hwdec_ctx hwctx;
+    struct mp_vt_ctx vtctx;
+
     CVPixelBufferRef pbuf;
     GLuint gl_planes[MP_MAX_PLANES];
-    struct mp_hwdec_ctx hwctx;
 };
 
 static struct vt_format vt_formats[] = {
@@ -63,7 +66,7 @@ static struct vt_format vt_formats[] = {
         .imgfmt = IMGFMT_UYVY,
         .planes = 1,
         .gl = {
-            { GL_RGB_422_APPLE, GL_UNSIGNED_SHORT_8_8_APPLE, GL_RGB }
+            { GL_RGB_422_APPLE, GL_UNSIGNED_SHORT_8_8_APPLE, GL_RGB, "gbra" }
         }
     },
     {
@@ -147,9 +150,9 @@ static bool check_hwdec(struct gl_hwdec *hw)
     return true;
 }
 
-static uint32_t get_vt_fmt(struct mp_hwdec_ctx *ctx)
+static uint32_t get_vt_fmt(struct mp_vt_ctx *vtctx)
 {
-    struct gl_hwdec *hw = ctx->priv;
+    struct gl_hwdec *hw = vtctx->priv;
     struct vt_format *f =
         vt_get_gl_format_from_imgfmt(hw->global->opts->videotoolbox_format);
     return f ? f->cvpixfmt : (uint32_t)-1;
@@ -161,21 +164,21 @@ static int create(struct gl_hwdec *hw)
         return -1;
 
     struct priv *p = talloc_zero(hw, struct priv);
-    struct vt_format *f = vt_get_gl_format_from_imgfmt(IMGFMT_NV12);
-    if (!f)
-        return -1;
-
     hw->priv = p;
-    hw->converted_imgfmt = f->imgfmt;
-    hw->hwctx = &p->hwctx;
-    hw->hwctx->download_image = download_image;
-    hw->hwctx->type = HWDEC_VIDEOTOOLBOX;
-    hw->hwctx->get_vt_fmt = get_vt_fmt;
 
-    hw->gl_texture_target = GL_TEXTURE_RECTANGLE;
     hw->gl->GenTextures(MP_MAX_PLANES, p->gl_planes);
 
-    hw->hwctx->priv = hw;
+    p->vtctx = (struct mp_vt_ctx){
+        .priv = hw,
+        .get_vt_fmt = get_vt_fmt,
+    };
+    p->hwctx = (struct mp_hwdec_ctx){
+        .type = HWDEC_VIDEOTOOLBOX,
+        .download_image = download_image,
+        .ctx = &p->vtctx,
+    };
+    hwdec_devices_add(hw->devs, &p->hwctx);
+
     return 0;
 }
 
@@ -189,16 +192,13 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
         return -1;
     }
 
-    hw->converted_imgfmt = f->imgfmt;
+    params->imgfmt = f->imgfmt;
     return 0;
 }
 
-static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
-                     GLuint *out_textures)
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
 {
-    if (!check_hwdec(hw))
-        return -1;
-
     struct priv *p = hw->priv;
     GL *gl = hw->gl;
 
@@ -222,11 +222,13 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
     const int planes  = CVPixelBufferGetPlaneCount(p->pbuf);
     assert(planar && planes == f->planes || f->planes == 1);
 
+    GLenum gl_target = GL_TEXTURE_RECTANGLE;
+
     for (int i = 0; i < f->planes; i++) {
-        gl->BindTexture(hw->gl_texture_target, p->gl_planes[i]);
+        gl->BindTexture(gl_target, p->gl_planes[i]);
 
         CGLError err = CGLTexImageIOSurface2D(
-            CGLGetCurrentContext(), hw->gl_texture_target,
+            CGLGetCurrentContext(), gl_target,
             f->gl[i].gl_internal_format,
             IOSurfaceGetWidthOfPlane(surface, i),
             IOSurfaceGetHeightOfPlane(surface, i),
@@ -236,9 +238,16 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
             MP_ERR(hw, "error creating IOSurface texture for plane %d: %s (%x)\n",
                    i, CGLErrorString(err), gl->GetError());
 
-        gl->BindTexture(hw->gl_texture_target, 0);
+        gl->BindTexture(gl_target, 0);
 
-        out_textures[i] = p->gl_planes[i];
+        out_frame->planes[i] = (struct gl_hwdec_plane){
+            .gl_texture = p->gl_planes[i],
+            .gl_target = gl_target,
+            .tex_w = IOSurfaceGetWidthOfPlane(surface, i),
+            .tex_h = IOSurfaceGetHeightOfPlane(surface, i),
+        };
+        snprintf(out_frame->planes[i].swizzle, sizeof(out_frame->planes[i].swizzle),
+                 "%s", f->gl[i].swizzle);
     }
 
     return 0;
@@ -251,6 +260,8 @@ static void destroy(struct gl_hwdec *hw)
 
     CVPixelBufferRelease(p->pbuf);
     gl->DeleteTextures(MP_MAX_PLANES, p->gl_planes);
+
+    hwdec_devices_remove(hw->devs, &p->hwctx);
 }
 
 const struct gl_hwdec_driver gl_hwdec_videotoolbox = {
@@ -259,6 +270,6 @@ const struct gl_hwdec_driver gl_hwdec_videotoolbox = {
     .imgfmt = IMGFMT_VIDEOTOOLBOX,
     .create = create,
     .reinit = reinit,
-    .map_image = map_image,
+    .map_frame = map_frame,
     .destroy = destroy,
 };
diff --git a/video/out/opengl/hwdec_vaegl.c b/video/out/opengl/hwdec_vaegl.c
index 7b34d6b..6c52cdd 100644
--- a/video/out/opengl/hwdec_vaegl.c
+++ b/video/out/opengl/hwdec_vaegl.c
@@ -114,7 +114,7 @@ struct priv {
     EGLImageKHR images[4];
     VAImage current_image;
     bool buffer_acquired;
-    struct mp_image *current_ref;
+    int current_mpfmt;
 
     EGLImageKHR (EGLAPIENTRY *CreateImageKHR)(EGLDisplay, EGLContext,
                                               EGLenum, EGLClientBuffer,
@@ -125,7 +125,7 @@ struct priv {
 
 static bool test_format(struct gl_hwdec *hw);
 
-static void unref_image(struct gl_hwdec *hw)
+static void unmap_frame(struct gl_hwdec *hw)
 {
     struct priv *p = hw->priv;
     VAStatus status;
@@ -149,8 +149,6 @@ static void unref_image(struct gl_hwdec *hw)
         p->current_image.image_id = VA_INVALID_ID;
     }
 
-    mp_image_unrefp(&p->current_ref);
-
     va_unlock(p->ctx);
 }
 
@@ -167,35 +165,13 @@ static void destroy_textures(struct gl_hwdec *hw)
 static void destroy(struct gl_hwdec *hw)
 {
     struct priv *p = hw->priv;
-    unref_image(hw);
+    unmap_frame(hw);
     destroy_textures(hw);
+    if (p->ctx)
+        hwdec_devices_remove(hw->devs, &p->ctx->hwctx);
     va_destroy(p->ctx);
 }
 
-// Create an empty dummy VPP. This works around a weird bug that affects the
-// VA surface format, as it is reported by vaDeriveImage(). Before a VPP
-// context or a decoder context is created, the surface format will be reported
-// as YV12. Surfaces created after context creation will report NV12 (even
-// though surface creation does not take a context as argument!). Existing
-// surfaces will change their format from YV12 to NV12 as soon as the decoder
-// renders to them! Because we want know the surface format in advance (to
-// simplify our renderer configuration logic), we hope that this hack gives
-// us reasonable behavior.
-// See: https://bugs.freedesktop.org/show_bug.cgi?id=79848
-static void insane_hack(struct gl_hwdec *hw)
-{
-    struct priv *p = hw->priv;
-    VAConfigID config;
-    if (vaCreateConfig(p->display, VAProfileNone, VAEntrypointVideoProc,
-                       NULL, 0, &config) == VA_STATUS_SUCCESS)
-    {
-        // We want to keep this until the VADisplay is destroyed. It will
-        // implicitly free the context.
-        VAContextID context;
-        vaCreateContext(p->display, config, 0, 0, 0, NULL, 0, &context);
-    }
-}
-
 static int create(struct gl_hwdec *hw)
 {
     GL *gl = hw->gl;
@@ -205,9 +181,7 @@ static int create(struct gl_hwdec *hw)
     p->current_image.buf = p->current_image.image_id = VA_INVALID_ID;
     p->log = hw->log;
 
-    if (hw->hwctx)
-        return -1;
-    if (!eglGetCurrentDisplay())
+    if (!eglGetCurrentContext())
         return -1;
 
     const char *exts = eglQueryString(eglGetCurrentDisplay(), EGL_EXTENSIONS);
@@ -248,13 +222,13 @@ static int create(struct gl_hwdec *hw)
 
     MP_VERBOSE(p, "using VAAPI EGL interop\n");
 
-    insane_hack(hw);
     if (!test_format(hw)) {
         destroy(hw);
         return -1;
     }
 
-    hw->hwctx = &p->ctx->hwctx;
+    p->ctx->hwctx.driver_name = hw->driver->name;
+    hwdec_devices_add(hw->devs, &p->ctx->hwctx);
     return 0;
 }
 
@@ -266,8 +240,6 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
     // Recreate them to get rid of all previous image data (possibly).
     destroy_textures(hw);
 
-    assert(params->imgfmt == hw->driver->imgfmt);
-
     gl->GenTextures(4, p->gl_textures);
     for (int n = 0; n < 4; n++) {
         gl->BindTexture(GL_TEXTURE_2D, p->gl_textures[n]);
@@ -278,6 +250,20 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
     }
     gl->BindTexture(GL_TEXTURE_2D, 0);
 
+    p->current_mpfmt = va_fourcc_to_imgfmt(params->hw_subfmt);
+    if (p->current_mpfmt != IMGFMT_NV12 &&
+        p->current_mpfmt != IMGFMT_420P)
+    {
+        MP_FATAL(p, "unsupported VA image format %s\n",
+                 mp_tag_str(params->hw_subfmt));
+        return -1;
+    }
+
+    MP_VERBOSE(p, "format: %s %s\n", mp_tag_str(params->hw_subfmt),
+               mp_imgfmt_to_name(p->current_mpfmt));
+
+    params->imgfmt = p->current_mpfmt;
+
     return 0;
 }
 
@@ -289,17 +275,15 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
     attribs[num_attribs] = EGL_NONE;                    \
     } while(0)
 
-static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
-                     GLuint *out_textures)
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
 {
     struct priv *p = hw->priv;
     GL *gl = hw->gl;
     VAStatus status;
     VAImage *va_image = &p->current_image;
 
-    unref_image(hw);
-
-    mp_image_setrefp(&p->current_ref, hw_image);
+    unmap_frame(hw);
 
     va_lock(p->ctx);
 
@@ -308,21 +292,9 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
         goto err;
 
     int mpfmt = va_fourcc_to_imgfmt(va_image->format.fourcc);
-    if (mpfmt != IMGFMT_NV12 && mpfmt != IMGFMT_420P) {
-        MP_FATAL(p, "unsupported VA image format %s\n",
-                 mp_tag_str(va_image->format.fourcc));
-        goto err;
-    }
-
-    if (!hw->converted_imgfmt) {
-        MP_VERBOSE(p, "format: %s %s\n", mp_tag_str(va_image->format.fourcc),
-                   mp_imgfmt_to_name(mpfmt));
-        hw->converted_imgfmt = mpfmt;
-    }
-
-    if (hw->converted_imgfmt != mpfmt) {
+    if (p->current_mpfmt != mpfmt) {
         MP_FATAL(p, "mid-stream hwdec format change (%s -> %s) not supported\n",
-                 mp_imgfmt_to_name(hw->converted_imgfmt), mp_imgfmt_to_name(mpfmt));
+                 mp_imgfmt_to_name(p->current_mpfmt), mp_imgfmt_to_name(mpfmt));
         goto err;
     }
 
@@ -361,12 +333,17 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
         gl->BindTexture(GL_TEXTURE_2D, p->gl_textures[n]);
         p->EGLImageTargetTexture2DOES(GL_TEXTURE_2D, p->images[n]);
 
-        out_textures[n] = p->gl_textures[n];
+        out_frame->planes[n] = (struct gl_hwdec_plane){
+            .gl_texture = p->gl_textures[n],
+            .gl_target = GL_TEXTURE_2D,
+            .tex_w = mp_image_plane_w(&layout, n),
+            .tex_h = mp_image_plane_h(&layout, n),
+        };
     }
     gl->BindTexture(GL_TEXTURE_2D, 0);
 
     if (va_image->format.fourcc == VA_FOURCC_YV12)
-        MPSWAP(GLuint, out_textures[1], out_textures[2]);
+        MPSWAP(struct gl_hwdec_plane, out_frame->planes[1], out_frame->planes[2]);
 
     va_unlock(p->ctx);
     return 0;
@@ -374,7 +351,7 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
 err:
     va_unlock(p->ctx);
     MP_FATAL(p, "mapping VAAPI EGL image failed\n");
-    unref_image(hw);
+    unmap_frame(hw);
     return -1;
 }
 
@@ -387,12 +364,13 @@ static bool test_format(struct gl_hwdec *hw)
     va_pool_set_allocator(alloc, p->ctx, VA_RT_FORMAT_YUV420);
     struct mp_image *surface = mp_image_pool_get(alloc, IMGFMT_VAAPI, 64, 64);
     if (surface) {
+        va_surface_init_subformat(surface);
         struct mp_image_params params = surface->params;
         if (reinit(hw, &params) >= 0) {
-            GLuint textures[4];
-            ok = map_image(hw, surface, textures) >= 0;
+            struct gl_hwdec_frame frame = {0};
+            ok = map_frame(hw, surface, &frame) >= 0;
         }
-        unref_image(hw);
+        unmap_frame(hw);
     }
     talloc_free(surface);
     talloc_free(alloc);
@@ -406,6 +384,7 @@ const struct gl_hwdec_driver gl_hwdec_vaegl = {
     .imgfmt = IMGFMT_VAAPI,
     .create = create,
     .reinit = reinit,
-    .map_image = map_image,
+    .map_frame = map_frame,
+    .unmap = unmap_frame,
     .destroy = destroy,
 };
diff --git a/video/out/opengl/hwdec_vaglx.c b/video/out/opengl/hwdec_vaglx.c
index 77b1f27..2e3017c 100644
--- a/video/out/opengl/hwdec_vaglx.c
+++ b/video/out/opengl/hwdec_vaglx.c
@@ -64,13 +64,13 @@ static void destroy(struct gl_hwdec *hw)
 {
     struct priv *p = hw->priv;
     destroy_texture(hw);
+    if (p->ctx)
+        hwdec_devices_remove(hw->devs, &p->ctx->hwctx);
     va_destroy(p->ctx);
 }
 
 static int create(struct gl_hwdec *hw)
 {
-    if (hw->hwctx)
-        return -1;
     Display *x11disp = glXGetCurrentDisplay();
     if (!x11disp)
         return -1;
@@ -126,8 +126,8 @@ static int create(struct gl_hwdec *hw)
         return -1;
     }
 
-    hw->hwctx = &p->ctx->hwctx;
-    hw->converted_imgfmt = IMGFMT_RGB0;
+    p->ctx->hwctx.driver_name = hw->driver->name;
+    hwdec_devices_add(hw->devs, &p->ctx->hwctx);
     return 0;
 }
 
@@ -138,8 +138,6 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
 
     destroy_texture(hw);
 
-    assert(params->imgfmt == hw->driver->imgfmt);
-
     gl->GenTextures(1, &p->gl_texture);
     gl->BindTexture(GL_TEXTURE_2D, p->gl_texture);
     gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
@@ -168,11 +166,13 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
     p->glXBindTexImage(p->xdisplay, p->glxpixmap, GLX_FRONT_EXT, NULL);
     gl->BindTexture(GL_TEXTURE_2D, 0);
 
+    params->imgfmt = IMGFMT_RGB0;
+
     return 0;
 }
 
-static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
-                     GLuint *out_textures)
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
 {
     struct priv *p = hw->priv;
     VAStatus status;
@@ -189,7 +189,16 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
     CHECK_VA_STATUS(p, "vaPutSurface()");
     va_unlock(p->ctx);
 
-    out_textures[0] = p->gl_texture;
+    *out_frame = (struct gl_hwdec_frame){
+        .planes = {
+            {
+                .gl_texture = p->gl_texture,
+                .gl_target = GL_TEXTURE_2D,
+                .tex_w = hw_image->w,
+                .tex_h = hw_image->h,
+            },
+        },
+    };
     return 0;
 }
 
@@ -199,6 +208,6 @@ const struct gl_hwdec_driver gl_hwdec_vaglx = {
     .imgfmt = IMGFMT_VAAPI,
     .create = create,
     .reinit = reinit,
-    .map_image = map_image,
+    .map_frame = map_frame,
     .destroy = destroy,
 };
diff --git a/video/out/opengl/hwdec_vdpau.c b/video/out/opengl/hwdec_vdpau.c
index b1d4962..83f664a 100644
--- a/video/out/opengl/hwdec_vdpau.c
+++ b/video/out/opengl/hwdec_vdpau.c
@@ -36,20 +36,35 @@ struct priv {
     struct mp_vdpau_ctx *ctx;
     uint64_t preemption_counter;
     struct mp_image_params image_params;
-    GLuint gl_texture;
+    GLuint gl_textures[4];
     bool vdpgl_initialized;
     GLvdpauSurfaceNV vdpgl_surface;
     VdpOutputSurface vdp_surface;
     struct mp_vdpau_mixer *mixer;
+    bool direct_mode;
     bool mapped;
 };
 
+static void unmap(struct gl_hwdec *hw)
+{
+    struct priv *p = hw->priv;
+    GL *gl = hw->gl;
+
+    if (p->mapped) {
+        gl->VDPAUUnmapSurfacesNV(1, &p->vdpgl_surface);
+        if (p->direct_mode) {
+            gl->VDPAUUnregisterSurfaceNV(p->vdpgl_surface);
+            p->vdpgl_surface = 0;
+        }
+    }
+    p->mapped = false;
+}
+
 static void mark_vdpau_objects_uninitialized(struct gl_hwdec *hw)
 {
     struct priv *p = hw->priv;
 
     p->vdp_surface = VDP_INVALID_HANDLE;
-    p->mixer->video_mixer = VDP_INVALID_HANDLE;
     p->mapped = false;
 }
 
@@ -60,16 +75,15 @@ static void destroy_objects(struct gl_hwdec *hw)
     struct vdp_functions *vdp = &p->ctx->vdp;
     VdpStatus vdp_st;
 
-    if (p->mapped)
-        gl->VDPAUUnmapSurfacesNV(1, &p->vdpgl_surface);
-    p->mapped = false;
+    unmap(hw);
 
     if (p->vdpgl_surface)
         gl->VDPAUUnregisterSurfaceNV(p->vdpgl_surface);
     p->vdpgl_surface = 0;
 
-    glDeleteTextures(1, &p->gl_texture);
-    p->gl_texture = 0;
+    glDeleteTextures(4, p->gl_textures);
+    for (int n = 0; n < 4; n++)
+        p->gl_textures[n] = 0;
 
     if (p->vdp_surface != VDP_INVALID_HANDLE) {
         vdp_st = vdp->output_surface_destroy(p->vdp_surface);
@@ -77,14 +91,14 @@ static void destroy_objects(struct gl_hwdec *hw)
     }
     p->vdp_surface = VDP_INVALID_HANDLE;
 
-    glCheckError(gl, hw->log, "Before uninitializing OpenGL interop");
+    gl_check_error(gl, hw->log, "Before uninitializing OpenGL interop");
 
     if (p->vdpgl_initialized)
         gl->VDPAUFiniNV();
 
     p->vdpgl_initialized = false;
 
-    glCheckError(gl, hw->log, "After uninitializing OpenGL interop");
+    gl_check_error(gl, hw->log, "After uninitializing OpenGL interop");
 }
 
 static void destroy(struct gl_hwdec *hw)
@@ -93,14 +107,14 @@ static void destroy(struct gl_hwdec *hw)
 
     destroy_objects(hw);
     mp_vdpau_mixer_destroy(p->mixer);
+    if (p->ctx)
+        hwdec_devices_remove(hw->devs, &p->ctx->hwctx);
     mp_vdpau_destroy(p->ctx);
 }
 
 static int create(struct gl_hwdec *hw)
 {
     GL *gl = hw->gl;
-    if (hw->hwctx)
-        return -1;
     Display *x11disp = glXGetCurrentDisplay();
     if (!x11disp)
         return -1;
@@ -120,8 +134,8 @@ static int create(struct gl_hwdec *hw)
         destroy(hw);
         return -1;
     }
-    hw->hwctx = &p->ctx->hwctx;
-    hw->converted_imgfmt = IMGFMT_RGB0;
+    p->ctx->hwctx.driver_name = hw->driver->name;
+    hwdec_devices_add(hw->devs, &p->ctx->hwctx);
     return 0;
 }
 
@@ -144,39 +158,50 @@ static int reinit(struct gl_hwdec *hw, struct mp_image_params *params)
 
     p->vdpgl_initialized = true;
 
-    vdp_st = vdp->output_surface_create(p->ctx->vdp_device,
-                                        VDP_RGBA_FORMAT_B8G8R8A8,
-                                        params->w, params->h, &p->vdp_surface);
-    CHECK_VDP_ERROR(p, "Error when calling vdp_output_surface_create");
-
-    gl->GenTextures(1, &p->gl_texture);
-    gl->BindTexture(GL_TEXTURE_2D, p->gl_texture);
-    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-    gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    p->direct_mode = params->hw_subfmt == IMGFMT_NV12;
+
+    gl->GenTextures(4, p->gl_textures);
+    for (int n = 0; n < 4; n++) {
+        gl->BindTexture(GL_TEXTURE_2D, p->gl_textures[n]);
+        GLenum filter = p->direct_mode ? GL_NEAREST : GL_LINEAR;
+        gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, filter);
+        gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, filter);
+        gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+        gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+    }
     gl->BindTexture(GL_TEXTURE_2D, 0);
 
-    p->vdpgl_surface = gl->VDPAURegisterOutputSurfaceNV(BRAINDEATH(p->vdp_surface),
-                                                        GL_TEXTURE_2D,
-                                                        1, &p->gl_texture);
-    if (!p->vdpgl_surface)
-        return -1;
+    if (p->direct_mode) {
+        params->imgfmt = IMGFMT_NV12;
+    } else {
+        vdp_st = vdp->output_surface_create(p->ctx->vdp_device,
+                                            VDP_RGBA_FORMAT_B8G8R8A8,
+                                            params->w, params->h, &p->vdp_surface);
+        CHECK_VDP_ERROR(p, "Error when calling vdp_output_surface_create");
+
+        p->vdpgl_surface = gl->VDPAURegisterOutputSurfaceNV(BRAINDEATH(p->vdp_surface),
+                                                            GL_TEXTURE_2D,
+                                                            1, p->gl_textures);
+        if (!p->vdpgl_surface)
+            return -1;
 
-    gl->VDPAUSurfaceAccessNV(p->vdpgl_surface, GL_READ_ONLY);
+        gl->VDPAUSurfaceAccessNV(p->vdpgl_surface, GL_READ_ONLY);
+
+        params->imgfmt = IMGFMT_RGB0;
+    }
 
-    glCheckError(gl, hw->log, "After initializing vdpau OpenGL interop");
+    gl_check_error(gl, hw->log, "After initializing vdpau OpenGL interop");
 
     return 0;
 }
 
-static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
-                     GLuint *out_textures)
+static int map_frame(struct gl_hwdec *hw, struct mp_image *hw_image,
+                     struct gl_hwdec_frame *out_frame)
 {
     struct priv *p = hw->priv;
     GL *gl = hw->gl;
-
-    assert(hw_image && hw_image->imgfmt == IMGFMT_VDPAU);
+    struct vdp_functions *vdp = &p->ctx->vdp;
+    VdpStatus vdp_st;
 
     int pe = mp_vdpau_handle_preemption(p->ctx, &p->preemption_counter);
     if (pe < 1) {
@@ -187,17 +212,58 @@ static int map_image(struct gl_hwdec *hw, struct mp_image *hw_image,
             return -1;
     }
 
-    if (!p->vdpgl_surface)
-        return -1;
+    if (p->direct_mode) {
+        VdpVideoSurface surface = (intptr_t)hw_image->planes[3];
 
-    if (p->mapped)
-        gl->VDPAUUnmapSurfacesNV(1, &p->vdpgl_surface);
+        // We need the uncropped size.
+        VdpChromaType s_chroma_type;
+        uint32_t s_w, s_h;
+        vdp_st = vdp->video_surface_get_parameters(surface, &s_chroma_type, &s_w, &s_h);
+        CHECK_VDP_ERROR(hw, "Error when calling vdp_video_surface_get_parameters");
+
+        p->vdpgl_surface = gl->VDPAURegisterVideoSurfaceNV(BRAINDEATH(surface),
+                                                           GL_TEXTURE_2D,
+                                                           4, p->gl_textures);
+        if (!p->vdpgl_surface)
+            return -1;
 
-    mp_vdpau_mixer_render(p->mixer, NULL, p->vdp_surface, NULL, hw_image, NULL);
+        gl->VDPAUSurfaceAccessNV(p->vdpgl_surface, GL_READ_ONLY);
+        gl->VDPAUMapSurfacesNV(1, &p->vdpgl_surface);
+
+        p->mapped = true;
+        *out_frame = (struct gl_hwdec_frame){
+            .vdpau_fields = true,
+        };
+        for (int n = 0; n < 4; n++) {
+            bool chroma = n >= 2;
+            out_frame->planes[n] = (struct gl_hwdec_plane){
+                .gl_texture = p->gl_textures[n],
+                .gl_target = GL_TEXTURE_2D,
+                .tex_w = s_w / (chroma ? 2 : 1),
+                .tex_h = s_h / (chroma ? 4 : 2),
+            };
+        };
+    } else {
+        if (!p->vdpgl_surface)
+            return -1;
+
+        mp_vdpau_mixer_render(p->mixer, NULL, p->vdp_surface, NULL, hw_image, NULL);
+
+        gl->VDPAUMapSurfacesNV(1, &p->vdpgl_surface);
+
+        p->mapped = true;
+        *out_frame = (struct gl_hwdec_frame){
+            .planes = {
+                {
+                    .gl_texture = p->gl_textures[0],
+                    .gl_target = GL_TEXTURE_2D,
+                    .tex_w = p->image_params.w,
+                    .tex_h = p->image_params.h,
+                },
+            },
+        };
+    }
 
-    gl->VDPAUMapSurfacesNV(1, &p->vdpgl_surface);
-    p->mapped = true;
-    out_textures[0] = p->gl_texture;
     return 0;
 }
 
@@ -207,6 +273,7 @@ const struct gl_hwdec_driver gl_hwdec_vdpau = {
     .imgfmt = IMGFMT_VDPAU,
     .create = create,
     .reinit = reinit,
-    .map_image = map_image,
+    .map_frame = map_frame,
+    .unmap = unmap,
     .destroy = destroy,
 };
diff --git a/video/out/opengl/lcms.c b/video/out/opengl/lcms.c
index 7db8da6..eaeb86f 100644
--- a/video/out/opengl/lcms.c
+++ b/video/out/opengl/lcms.c
@@ -16,6 +16,7 @@
  */
 
 #include <string.h>
+#include <math.h>
 
 #include "mpv_talloc.h"
 
@@ -25,10 +26,10 @@
 #include "common/common.h"
 #include "misc/bstr.h"
 #include "common/msg.h"
+#include "options/m_config.h"
 #include "options/m_option.h"
 #include "options/path.h"
-
-#include "video.h"
+#include "video/csputils.h"
 #include "lcms.h"
 
 #include "osdep/io.h"
@@ -42,14 +43,14 @@
 struct gl_lcms {
     void *icc_data;
     size_t icc_size;
-    char *icc_path;
+    bool using_memory_profile;
     bool changed;
     enum mp_csp_prim prev_prim;
     enum mp_csp_trc prev_trc;
 
     struct mp_log *log;
     struct mpv_global *global;
-    struct mp_icc_opts opts;
+    struct mp_icc_opts *opts;
 };
 
 static bool parse_3dlut_size(const char *arg, int *p1, int *p2, int *p3)
@@ -80,6 +81,7 @@ const struct m_sub_options mp_icc_conf = {
         OPT_FLAG("icc-profile-auto", profile_auto, 0),
         OPT_STRING("icc-cache-dir", cache_dir, 0),
         OPT_INT("icc-intent", intent, 0),
+        OPT_INTRANGE("icc-contrast", contrast, 0, 0, 100000),
         OPT_STRING_VALIDATE("3dlut-size", size_str, 0, validate_3dlut_size_opt),
 
         OPT_REMOVED("icc-cache", "see icc-cache-dir"),
@@ -99,25 +101,28 @@ static void lcms2_error_handler(cmsContext ctx, cmsUInt32Number code,
     MP_ERR(p, "lcms2: %s\n", msg);
 }
 
-static bool load_profile(struct gl_lcms *p)
+static void load_profile(struct gl_lcms *p)
 {
-    if (p->icc_data && p->icc_size)
-        return true;
+    talloc_free(p->icc_data);
+    p->icc_data = NULL;
+    p->icc_size = 0;
+    p->using_memory_profile = false;
 
-    if (!p->icc_path)
-        return false;
+    if (!p->opts->profile || !p->opts->profile[0])
+        return;
 
-    char *fname = mp_get_user_path(NULL, p->global, p->icc_path);
+    char *fname = mp_get_user_path(NULL, p->global, p->opts->profile);
     MP_VERBOSE(p, "Opening ICC profile '%s'\n", fname);
     struct bstr iccdata = stream_read_file(fname, p, p->global,
                                            100000000); // 100 MB
     talloc_free(fname);
     if (!iccdata.len)
-        return false;
+        return;
+
+    talloc_free(p->icc_data);
 
     p->icc_data = iccdata.start;
     p->icc_size = iccdata.len;
-    return true;
 }
 
 struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log,
@@ -128,44 +133,55 @@ struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log,
         .global = global,
         .log = log,
         .changed = true,
+        .opts = m_sub_options_copy(p, &mp_icc_conf, mp_icc_conf.defaults),
     };
     return p;
 }
 
 void gl_lcms_set_options(struct gl_lcms *p, struct mp_icc_opts *opts)
 {
-    p->opts = *opts;
-    p->icc_path = talloc_strdup(p, p->opts.profile);
-    load_profile(p);
+    struct mp_icc_opts *old_opts = p->opts;
+    p->opts = m_sub_options_copy(p, &mp_icc_conf, opts);
+
+    if ((p->using_memory_profile && !p->opts->profile_auto) ||
+        !bstr_equals(bstr0(p->opts->profile), bstr0(old_opts->profile)))
+    {
+        load_profile(p);
+    }
+
     p->changed = true; // probably
+
+    talloc_free(old_opts);
 }
 
 // Warning: profile.start must point to a ta allocation, and the function
 //          takes over ownership.
-void gl_lcms_set_memory_profile(struct gl_lcms *p, bstr *profile)
+// Returns whether the internal profile was changed.
+bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile)
 {
-    if (!p->opts.profile_auto) {
-        talloc_free(profile->start);
-        return;
+    if (!p->opts->profile_auto || (p->opts->profile && p->opts->profile[0])) {
+        talloc_free(profile.start);
+        return false;
     }
 
-    if (!p->icc_path && p->icc_data && profile->start &&
-        profile->len == p->icc_size &&
-        memcmp(profile->start, p->icc_data, p->icc_size) == 0)
+    if (p->using_memory_profile &&
+        p->icc_data && profile.start &&
+        profile.len == p->icc_size &&
+        memcmp(profile.start, p->icc_data, p->icc_size) == 0)
     {
-        talloc_free(profile->start);
-        return;
+        talloc_free(profile.start);
+        return false;
     }
 
     p->changed = true;
-
-    talloc_free(p->icc_path);
-    p->icc_path = NULL;
+    p->using_memory_profile = true;
 
     talloc_free(p->icc_data);
 
-    p->icc_data = talloc_steal(p, profile->start);
-    p->icc_size = profile->len;
+    p->icc_data = talloc_steal(p, profile.start);
+    p->icc_size = profile.len;
+
+    return true;
 }
 
 // Return and _reset_ whether the profile or config has changed since the last
@@ -180,7 +196,15 @@ bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim,
     return change;
 }
 
-static cmsHPROFILE get_vid_profile(cmsContext cms, cmsHPROFILE disp_profile,
+// Whether a profile is set. (gl_lcms_get_lut3d() is expected to return a lut,
+// but it could still fail due to runtime errors, such as invalid icc data.)
+bool gl_lcms_has_profile(struct gl_lcms *p)
+{
+    return p->icc_size > 0;
+}
+
+static cmsHPROFILE get_vid_profile(struct gl_lcms *p, cmsContext cms,
+                                   cmsHPROFILE disp_profile,
                                    enum mp_csp_prim prim, enum mp_csp_trc trc)
 {
     // The input profile for the transformation is dependent on the video
@@ -213,21 +237,47 @@ static cmsHPROFILE get_vid_profile(cmsContext cms, cmsHPROFILE disp_profile,
 
     case MP_CSP_TRC_BT_1886: {
         // To build an appropriate BT.1886 transformation we need access to
-        // the display's black point, so we use the reverse mappings
+        // the display's black point, so we use LittleCMS' detection function.
+        // Relative colorimetric is used since we want to approximate the
+        // BT.1886 to the target device's actual black point even in e.g.
+        // perceptual mode
+        const int intent = MP_INTENT_RELATIVE_COLORIMETRIC;
+        cmsCIEXYZ bp_XYZ;
+        if (!cmsDetectBlackPoint(&bp_XYZ, disp_profile, intent, 0))
+            return false;
+
+        // Map this XYZ value back into the (linear) source space
         cmsToneCurve *linear = cmsBuildGamma(cms, 1.0);
         cmsHPROFILE rev_profile = cmsCreateRGBProfileTHR(cms, &wp_xyY, &prim_xyY,
                 (cmsToneCurve*[3]){linear, linear, linear});
-        cmsHTRANSFORM disp2src = cmsCreateTransformTHR(cms,
-                disp_profile, TYPE_RGB_16, rev_profile, TYPE_RGB_DBL,
-                INTENT_RELATIVE_COLORIMETRIC, 0);
+        cmsHPROFILE xyz_profile = cmsCreateXYZProfile();
+        cmsHTRANSFORM xyz2src = cmsCreateTransformTHR(cms,
+                xyz_profile, TYPE_XYZ_DBL, rev_profile, TYPE_RGB_DBL,
+                intent, 0);
         cmsFreeToneCurve(linear);
         cmsCloseProfile(rev_profile);
-        if (!disp2src)
+        cmsCloseProfile(xyz_profile);
+        if (!xyz2src)
             return false;
 
-        uint64_t disp_black[3] = {0};
         double src_black[3];
-        cmsDoTransform(disp2src, disp_black, src_black, 1);
+        cmsDoTransform(xyz2src, &bp_XYZ, src_black, 1);
+        cmsDeleteTransform(xyz2src);
+
+        // Contrast limiting
+        if (p->opts->contrast > 0) {
+            for (int i = 0; i < 3; i++)
+                src_black[i] = MPMAX(src_black[i], 1.0 / p->opts->contrast);
+        }
+
+        // Built-in contrast failsafe
+        double contrast = 3.0 / (src_black[0] + src_black[1] + src_black[2]);
+        if (contrast > 100000) {
+            MP_WARN(p, "ICC profile detected contrast very high (>100000),"
+                    " falling back to contrast 1000 for sanity. Set the"
+                    " icc-contrast option to silence this warning.\n");
+            src_black[0] = src_black[1] = src_black[2] = 1.0 / 1000;
+        }
 
         // Build the parametric BT.1886 transfer curve, one per channel
         for (int i = 0; i < 3; i++) {
@@ -265,10 +315,10 @@ bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
     int s_r, s_g, s_b;
     bool result = false;
 
-    if (!parse_3dlut_size(p->opts.size_str, &s_r, &s_g, &s_b))
+    if (!parse_3dlut_size(p->opts->size_str, &s_r, &s_g, &s_b))
         return false;
 
-    if (!p->icc_data && !p->icc_path)
+    if (!gl_lcms_has_profile(p))
         return false;
 
     void *tmp = talloc_new(NULL);
@@ -277,13 +327,14 @@ bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
     cmsContext cms = NULL;
 
     char *cache_file = NULL;
-    if (p->opts.cache_dir && p->opts.cache_dir[0]) {
+    if (p->opts->cache_dir && p->opts->cache_dir[0]) {
         // Gamma is included in the header to help uniquely identify it,
         // because we may change the parameter in the future or make it
         // customizable, same for the primaries.
         char *cache_info = talloc_asprintf(tmp,
-                "ver=1.3, intent=%d, size=%dx%dx%d, prim=%d, trc=%d\n",
-                p->opts.intent, s_r, s_g, s_b, prim, trc);
+                "ver=1.3, intent=%d, size=%dx%dx%d, prim=%d, trc=%d, "
+                "contrast=%d\n",
+                p->opts->intent, s_r, s_g, s_b, prim, trc, p->opts->contrast);
 
         uint8_t hash[32];
         struct AVSHA *sha = av_sha_alloc();
@@ -295,7 +346,7 @@ bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
         av_sha_final(sha, hash);
         av_free(sha);
 
-        char *cache_dir = mp_get_user_path(tmp, p->global, p->opts.cache_dir);
+        char *cache_dir = mp_get_user_path(tmp, p->global, p->opts->cache_dir);
         cache_file = talloc_strdup(tmp, "");
         for (int i = 0; i < sizeof(hash); i++)
             cache_file = talloc_asprintf_append(cache_file, "%02X", hash[i]);
@@ -305,7 +356,7 @@ bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
     }
 
     // check cache
-    if (cache_file) {
+    if (cache_file && stat(cache_file, &(struct stat){0}) == 0) {
         MP_VERBOSE(p, "Opening 3D LUT cache in file '%s'.\n", cache_file);
         struct bstr cachedata = stream_read_file(cache_file, tmp, p->global,
                                                  1000000000); // 1 GB
@@ -327,7 +378,7 @@ bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
     if (!profile)
         goto error_exit;
 
-    cmsHPROFILE vid_profile = get_vid_profile(cms, profile, prim, trc);
+    cmsHPROFILE vid_profile = get_vid_profile(p, cms, profile, prim, trc);
     if (!vid_profile) {
         cmsCloseProfile(profile);
         goto error_exit;
@@ -335,8 +386,9 @@ bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
 
     cmsHTRANSFORM trafo = cmsCreateTransformTHR(cms, vid_profile, TYPE_RGB_16,
                                                 profile, TYPE_RGB_16,
-                                                p->opts.intent,
-                                                cmsFLAGS_HIGHRESPRECALC);
+                                                p->opts->intent,
+                                                cmsFLAGS_HIGHRESPRECALC |
+                                                cmsFLAGS_BLACKPOINTCOMPENSATION);
     cmsCloseProfile(profile);
     cmsCloseProfile(vid_profile);
 
@@ -406,7 +458,7 @@ struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log,
 }
 
 void gl_lcms_set_options(struct gl_lcms *p, struct mp_icc_opts *opts) { }
-void gl_lcms_set_memory_profile(struct gl_lcms *p, bstr *profile) { }
+bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile) {return false;}
 
 bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim,
                          enum mp_csp_trc trc)
@@ -414,6 +466,11 @@ bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim,
     return false;
 }
 
+bool gl_lcms_has_profile(struct gl_lcms *p)
+{
+    return false;
+}
+
 bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **result_lut3d,
                        enum mp_csp_prim prim, enum mp_csp_trc trc)
 {
diff --git a/video/out/opengl/lcms.h b/video/out/opengl/lcms.h
index ee2a48b..094514a 100644
--- a/video/out/opengl/lcms.h
+++ b/video/out/opengl/lcms.h
@@ -13,9 +13,14 @@ struct mp_icc_opts {
     char *cache_dir;
     char *size_str;
     int intent;
+    int contrast;
+};
+
+struct lut3d {
+    uint16_t *data;
+    int size[3];
 };
 
-struct lut3d;
 struct mp_log;
 struct mpv_global;
 struct gl_lcms;
@@ -23,7 +28,8 @@ struct gl_lcms;
 struct gl_lcms *gl_lcms_init(void *talloc_ctx, struct mp_log *log,
                              struct mpv_global *global);
 void gl_lcms_set_options(struct gl_lcms *p, struct mp_icc_opts *opts);
-void gl_lcms_set_memory_profile(struct gl_lcms *p, bstr *profile);
+bool gl_lcms_set_memory_profile(struct gl_lcms *p, bstr profile);
+bool gl_lcms_has_profile(struct gl_lcms *p);
 bool gl_lcms_get_lut3d(struct gl_lcms *p, struct lut3d **,
                        enum mp_csp_prim prim, enum mp_csp_trc trc);
 bool gl_lcms_has_changed(struct gl_lcms *p, enum mp_csp_prim prim,
diff --git a/video/out/opengl/nnedi3.c b/video/out/opengl/nnedi3.c
deleted file mode 100644
index 3c12fcc..0000000
--- a/video/out/opengl/nnedi3.c
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- *
- * The shader portions may have been derived from existing LGPLv3 shaders
- * (see below), possibly making this file effectively LGPLv3.
- */
-
-#include "nnedi3.h"
-
-#if HAVE_NNEDI
-
-#include <assert.h>
-#include <stdint.h>
-#include <float.h>
-
-#include <libavutil/bswap.h>
-
-#include "video.h"
-
-/*
- * NNEDI3, an intra-field deinterlacer
- *
- * The original filter was authored by Kevin Stone (aka. tritical) and is
- * licensed under GPL2 terms:
- *     http://bengal.missouri.edu/~kes25c/
- *
- * A LGPLv3 licensed OpenCL kernel was created by SEt:
- *     http://forum.doom9.org/showthread.php?t=169766
- *
- * A HLSL port further modified by madshi, Shiandow and Zach Saw could be
- * found at (also LGPLv3 licensed):
- *     https://github.com/zachsaw/MPDN_Extensions
- *
- */
-
-#define GLSL(x) gl_sc_add(sc, #x "\n");
-#define GLSLF(...) gl_sc_addf(sc, __VA_ARGS__)
-#define GLSLH(x) gl_sc_hadd(sc, #x "\n");
-#define GLSLHF(...) gl_sc_haddf(sc, __VA_ARGS__)
-
-const struct nnedi3_opts nnedi3_opts_def = {
-    .neurons = 1,
-    .window = 0,
-    .upload = NNEDI3_UPLOAD_UBO,
-};
-
-#define OPT_BASE_STRUCT struct nnedi3_opts
-const struct m_sub_options nnedi3_conf = {
-    .opts = (const m_option_t[]) {
-        OPT_CHOICE("neurons", neurons, 0,
-                   ({"16", 0},
-                    {"32", 1},
-                    {"64", 2},
-                    {"128", 3})),
-        OPT_CHOICE("window", window, 0,
-                   ({"8x4", 0},
-                    {"8x6", 1})),
-        OPT_CHOICE("upload", upload, 0,
-                   ({"ubo", NNEDI3_UPLOAD_UBO},
-                    {"shader", NNEDI3_UPLOAD_SHADER})),
-        {0}
-    },
-    .size = sizeof(struct nnedi3_opts),
-    .defaults = &nnedi3_opts_def,
-};
-
-const static char nnedi3_weights[40320 * 4 + 1] =
-#include "video/out/opengl/nnedi3_weights.inc"
-;
-
-const int nnedi3_weight_offsets[9] =
-    {0, 1088, 3264, 7616, 16320, 17920, 21120, 27520, 40320};
-
-const int nnedi3_neurons[4] = {16, 32, 64, 128};
-const int nnedi3_window_width[2] = {8, 8};
-const int nnedi3_window_height[2] = {4, 6};
-
-const float* get_nnedi3_weights(const struct nnedi3_opts *conf, int *size)
-{
-    int idx = conf->window * 4 + conf->neurons;
-    const int offset = nnedi3_weight_offsets[idx];
-    *size = (nnedi3_weight_offsets[idx + 1] - offset) * 4;
-    return (const float*)(nnedi3_weights + offset * 4);
-}
-
-void pass_nnedi3(GL *gl, struct gl_shader_cache *sc, int planes, int tex_num,
-                 int step, float tex_mul, const struct nnedi3_opts *conf,
-                 struct gl_transform *transform, GLenum tex_target)
-{
-    assert(0 <= step && step < 2);
-
-    if (!conf)
-        conf = &nnedi3_opts_def;
-
-    const int neurons = nnedi3_neurons[conf->neurons];
-    const int width = nnedi3_window_width[conf->window];
-    const int height = nnedi3_window_height[conf->window];
-
-    const int offset = nnedi3_weight_offsets[conf->window * 4 + conf->neurons];
-    const uint32_t *weights = (const int*)(nnedi3_weights + offset * 4);
-
-    GLSLF("// nnedi3 (step %d, neurons %d, window %dx%d, mode %d)\n",
-          step, neurons, width, height, conf->upload);
-
-    // This is required since each row will be encoded into vec4s
-    assert(width % 4 == 0);
-    const int sample_count = width * height / 4;
-
-    if (conf->upload == NNEDI3_UPLOAD_UBO) {
-        char buf[32];
-        snprintf(buf, sizeof(buf), "vec4 weights[%d];",
-                 neurons * (sample_count * 2 + 1));
-        gl_sc_uniform_buffer(sc, "NNEDI3_WEIGHTS", buf, 0);
-        if (!gl->es && gl->glsl_version < 140)
-            gl_sc_enable_extension(sc, "GL_ARB_uniform_buffer_object");
-    } else if (conf->upload == NNEDI3_UPLOAD_SHADER) {
-        // Somehow necessary for hard coding approach.
-        GLSLH(#pragma optionNV(fastprecision on))
-    }
-
-    GLSLHF("float nnedi3(%s tex, vec2 pos, vec2 tex_size, vec2 pixel_size, int plane, float tex_mul) {\n", mp_sampler_type(tex_target));
-
-    if (step == 0) {
-        *transform = (struct gl_transform){{{1.0,0.0}, {0.0,2.0}}, {0.0,-0.5}};
-
-        GLSLH(if (fract(pos.y * tex_size.y) < 0.5)
-                  return texture(tex, pos + vec2(0, 0.25) * pixel_size)[plane] * tex_mul;)
-        GLSLHF("#define GET(i, j) "
-               "(texture(tex, pos+vec2((i)-(%f),(j)-(%f)+0.25) * pixel_size)[plane]*tex_mul)\n",
-               width / 2.0 - 1, (height - 1) / 2.0);
-    } else {
-        *transform = (struct gl_transform){{{2.0,0.0}, {0.0,1.0}}, {-0.5,0.0}};
-
-        GLSLH(if (fract(pos.x * tex_size.x) < 0.5)
-                  return texture(tex, pos + vec2(0.25, 0) * pixel_size)[plane] * tex_mul;)
-        GLSLHF("#define GET(i, j) "
-               "(texture(tex, pos+vec2((j)-(%f)+0.25,(i)-(%f)) * pixel_size)[plane]*tex_mul)\n",
-               (height - 1) / 2.0, width / 2.0 - 1);
-    }
-
-    GLSLHF("vec4 samples[%d];\n", sample_count);
-
-    for (int y = 0; y < height; y++)
-        for (int x = 0; x < width; x += 4) {
-            GLSLHF("samples[%d] = vec4(GET(%d.0, %d.0), GET(%d.0, %d.0),"
-                                      "GET(%d.0, %d.0), GET(%d.0, %d.0));\n",
-                   (y * width + x) / 4, x, y, x+1, y, x+2, y, x+3, y);
-        }
-
-    GLSLHF("float sum = 0.0, sumsq = 0.0;"
-           "for (int i = 0; i < %d; i++) {"
-               "sum += dot(samples[i], vec4(1.0));"
-               "sumsq += dot(samples[i], samples[i]);"
-           "}\n", sample_count);
-
-    GLSLHF("float mstd0 = sum / %d.0;\n"
-           "float mstd1 = sumsq / %d.0 - mstd0 * mstd0;\n"
-           "float mstd2 = mix(0.0, inversesqrt(mstd1), mstd1 >= %.12e);\n"
-           "mstd1 *= mstd2;\n",
-           width * height, width * height, FLT_EPSILON);
-
-    GLSLHF("float vsum = 0.0, wsum = 0.0, sum1, sum2;\n");
-
-    if (conf->upload == NNEDI3_UPLOAD_SHADER) {
-        GLSLH(#define T(x) intBitsToFloat(x))
-        GLSLH(#define W(i,w0,w1,w2,w3) dot(samples[i],vec4(T(w0),T(w1),T(w2),T(w3))))
-
-        GLSLHF("#define WS(w0,w1) "
-               "sum1 = exp(sum1 * mstd2 + T(w0));"
-               "sum2 = sum2 * mstd2 + T(w1);"
-               "wsum += sum1;"
-               "vsum += sum1*(sum2/(1.0+abs(sum2)));\n");
-
-        for (int n = 0; n < neurons; n++) {
-            const uint32_t *weights_ptr = weights + (sample_count * 2 + 1) * 4 * n;
-            for (int s = 0; s < 2; s++) {
-                GLSLHF("sum%d", s + 1);
-                for (int i = 0; i < sample_count; i++) {
-                    GLSLHF("%cW(%d,%d,%d,%d,%d)", i == 0 ? '=' : '+', i,
-                           (int)av_le2ne32(weights_ptr[0]),
-                           (int)av_le2ne32(weights_ptr[1]),
-                           (int)av_le2ne32(weights_ptr[2]),
-                           (int)av_le2ne32(weights_ptr[3]));
-                    weights_ptr += 4;
-                }
-                GLSLHF(";");
-            }
-            GLSLHF("WS(%d,%d);\n", (int)av_le2ne32(weights_ptr[0]),
-                                   (int)av_le2ne32(weights_ptr[1]));
-        }
-    } else if (conf->upload == NNEDI3_UPLOAD_UBO) {
-        GLSLH(int idx = 0;)
-
-        GLSLHF("for (int n = 0; n < %d; n++) {\n", neurons);
-
-        for (int s = 0; s < 2; s++) {
-            GLSLHF("sum%d = 0.0;\n"
-                   "for (int i = 0; i < %d; i++) {"
-                       "sum%d += dot(samples[i], weights[idx++]);"
-                   "}\n",
-                   s + 1, sample_count, s + 1);
-        }
-
-        GLSLH(sum1 = exp(sum1 * mstd2 + weights[idx][0]);
-              sum2 = sum2 * mstd2 + weights[idx++][1];
-              wsum += sum1;
-              vsum += sum1*(sum2/(1.0+abs(sum2)));)
-
-        GLSLHF("}\n");
-    }
-
-    GLSLH(return clamp(mstd0 + 5.0 * vsum / wsum * mstd1, 0.0, 1.0);)
-
-    GLSLHF("}\n"); // nnedi3
-
-    GLSL(color = vec4(1.0);)
-
-    for (int i = 0; i < planes; i++) {
-        GLSLF("color[%d] = nnedi3(texture%d, texcoord%d, texture_size%d, pixel_size%d, %d, %f);\n",
-              i, tex_num, tex_num, tex_num, tex_num, i, tex_mul);
-    }
-}
-
-#else
-
-const struct m_sub_options nnedi3_conf = {0};
-
-
-const float* get_nnedi3_weights(const struct nnedi3_opts *conf, int *size)
-{
-    return NULL;
-}
-
-void pass_nnedi3(GL *gl, struct gl_shader_cache *sc, int planes, int tex_num,
-                 int step, float tex_mul, const struct nnedi3_opts *conf,
-                 struct gl_transform *transform, GLenum tex_target)
-{
-}
-
-#endif
diff --git a/video/out/opengl/nnedi3.h b/video/out/opengl/nnedi3.h
deleted file mode 100644
index c3895a0..0000000
--- a/video/out/opengl/nnedi3.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef MP_GL_NNEDI3_H
-#define MP_GL_NNEDI3_H
-
-#include "config.h"
-#include "common.h"
-#include "utils.h"
-
-#define HAVE_NNEDI HAVE_GPL3
-
-#define NNEDI3_UPLOAD_UBO 0
-#define NNEDI3_UPLOAD_SHADER 1
-
-struct nnedi3_opts {
-    int neurons;
-    int window;
-    int upload;
-};
-
-extern const struct nnedi3_opts nnedi3_opts_def;
-extern const struct m_sub_options nnedi3_conf;
-
-const float* get_nnedi3_weights(const struct nnedi3_opts *conf, int *size);
-
-void pass_nnedi3(GL *gl, struct gl_shader_cache *sc, int planes, int tex_num,
-                 int step, float tex_mul, const struct nnedi3_opts *conf,
-                 struct gl_transform *transform, GLenum tex_target);
-
-#endif
diff --git a/video/out/opengl/nnedi3_weights.bin b/video/out/opengl/nnedi3_weights.bin
deleted file mode 100644
index e1659d8..0000000
--- a/video/out/opengl/nnedi3_weights.bin
+++ /dev/null
diff --git a/video/out/opengl/osd.c b/video/out/opengl/osd.c
index c554425..7b1ec16 100644
--- a/video/out/opengl/osd.c
+++ b/video/out/opengl/osd.c
@@ -17,19 +17,16 @@
 
 #include <stdlib.h>
 #include <assert.h>
+#include <limits.h>
+
 #include <libavutil/common.h>
 
 #include "video/out/bitmap_packer.h"
 
+#include "formats.h"
 #include "utils.h"
 #include "osd.h"
 
-struct osd_fmt_entry {
-    GLint internal_format;
-    GLint format;
-    GLenum type;
-};
-
 // glBlendFuncSeparate() arguments
 static const int blend_factors[SUBBITMAP_COUNT][4] = {
     [SUBBITMAP_LIBASS] = {GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA,
@@ -38,21 +35,6 @@ static const int blend_factors[SUBBITMAP_COUNT][4] = {
                           GL_ONE,       GL_ONE_MINUS_SRC_ALPHA},
 };
 
-static const struct osd_fmt_entry osd_to_gl3_formats[SUBBITMAP_COUNT] = {
-    [SUBBITMAP_LIBASS] = {GL_RED,   GL_RED,   GL_UNSIGNED_BYTE},
-    [SUBBITMAP_RGBA] =   {GL_RGBA,  GL_RGBA,  GL_UNSIGNED_BYTE},
-};
-
-static const struct osd_fmt_entry osd_to_gles3_formats[SUBBITMAP_COUNT] = {
-    [SUBBITMAP_LIBASS] = {GL_R8,    GL_RED,   GL_UNSIGNED_BYTE},
-    [SUBBITMAP_RGBA] =   {GL_RGBA8, GL_RGBA,  GL_UNSIGNED_BYTE},
-};
-
-static const struct osd_fmt_entry osd_to_gl2_formats[SUBBITMAP_COUNT] = {
-    [SUBBITMAP_LIBASS] = {GL_LUMINANCE, GL_LUMINANCE,   GL_UNSIGNED_BYTE},
-    [SUBBITMAP_RGBA] =   {GL_RGBA,      GL_RGBA,        GL_UNSIGNED_BYTE},
-};
-
 struct vertex {
     float position[2];
     float texcoord[2];
@@ -77,16 +59,17 @@ struct mpgl_osd_part {
     struct sub_bitmap *subparts;
     struct vertex *vertices;
     struct bitmap_packer *packer;
+    void *upload;
 };
 
 struct mpgl_osd {
     struct mp_log *log;
     struct osd_state *osd;
     GL *gl;
+    GLint max_tex_wh;
     bool use_pbo;
-    bool scaled;
     struct mpgl_osd_part *parts[MAX_OSD_PARTS];
-    const struct osd_fmt_entry *fmt_table;
+    const struct gl_format *fmt_table[SUBBITMAP_COUNT];
     bool formats[SUBBITMAP_COUNT];
     struct gl_vao vao;
     int64_t change_counter;
@@ -98,37 +81,32 @@ struct mpgl_osd {
 
 struct mpgl_osd *mpgl_osd_init(GL *gl, struct mp_log *log, struct osd_state *osd)
 {
-    GLint max_texture_size;
-    gl->GetIntegerv(GL_MAX_TEXTURE_SIZE, &max_texture_size);
-
     struct mpgl_osd *ctx = talloc_ptrtype(NULL, ctx);
     *ctx = (struct mpgl_osd) {
         .log = log,
         .osd = osd,
         .gl = gl,
-        .fmt_table = osd_to_gl3_formats,
         .scratch = talloc_zero_size(ctx, 1),
     };
 
-    if (gl->es >= 300) {
-        ctx->fmt_table = osd_to_gles3_formats;
-    } else if (!(gl->mpgl_caps & MPGL_CAP_TEX_RG)) {
-        ctx->fmt_table = osd_to_gl2_formats;
-    }
+    gl->GetIntegerv(GL_MAX_TEXTURE_SIZE, &ctx->max_tex_wh);
+
+    ctx->fmt_table[SUBBITMAP_LIBASS] = gl_find_unorm_format(gl, 1, 1);
+    ctx->fmt_table[SUBBITMAP_RGBA]   = gl_find_unorm_format(gl, 1, 4);
 
     for (int n = 0; n < MAX_OSD_PARTS; n++) {
         struct mpgl_osd_part *p = talloc_ptrtype(ctx, p);
         *p = (struct mpgl_osd_part) {
             .packer = talloc_struct(p, struct bitmap_packer, {
-                .w_max = max_texture_size,
-                .h_max = max_texture_size,
+                .w_max = ctx->max_tex_wh,
+                .h_max = ctx->max_tex_wh,
             }),
         };
         ctx->parts[n] = p;
     }
 
     for (int n = 0; n < SUBBITMAP_COUNT; n++)
-        ctx->formats[n] = ctx->fmt_table[n].type != 0;
+        ctx->formats[n] = !!ctx->fmt_table[n];
 
     gl_vao_init(&ctx->vao, gl, sizeof(struct vertex), vertex_vao);
 
@@ -149,6 +127,7 @@ void mpgl_osd_destroy(struct mpgl_osd *ctx)
         gl->DeleteTextures(1, &p->texture);
         if (gl->DeleteBuffers)
             gl->DeleteBuffers(1, &p->buffer);
+        talloc_free(p->upload);
     }
     talloc_free(ctx);
 }
@@ -158,38 +137,79 @@ void mpgl_osd_set_options(struct mpgl_osd *ctx, bool pbo)
     ctx->use_pbo = pbo;
 }
 
-static bool upload_pbo(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
-                       struct sub_bitmaps *imgs)
+static bool upload(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
+                   struct sub_bitmaps *imgs, bool pbo)
 {
     GL *gl = ctx->gl;
     bool success = true;
-    struct osd_fmt_entry fmt = ctx->fmt_table[imgs->format];
-    int pix_stride = glFmt2bpp(fmt.format, fmt.type);
+    const struct gl_format *fmt = ctx->fmt_table[imgs->format];
+    size_t pix_stride = gl_bytes_per_pixel(fmt->format, fmt->type);
+    size_t buffer_size = pix_stride * osd->h * osd->w;
+
+    char *data = NULL;
+    void *texdata = NULL;
+
+    if (pbo) {
+        if (!osd->buffer) {
+            gl->GenBuffers(1, &osd->buffer);
+            gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, osd->buffer);
+            gl->BufferData(GL_PIXEL_UNPACK_BUFFER, buffer_size, NULL,
+                           GL_DYNAMIC_COPY);
+        }
 
-    if (!osd->buffer) {
-        gl->GenBuffers(1, &osd->buffer);
         gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, osd->buffer);
-        gl->BufferData(GL_PIXEL_UNPACK_BUFFER, osd->w * osd->h * pix_stride,
-                        NULL, GL_DYNAMIC_COPY);
-        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+        data = gl->MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, buffer_size,
+                                  GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT);
+        if (!data) {
+            success = false;
+            goto done;
+        }
+    } else {
+        if (!imgs->packed) {
+            if (!osd->upload)
+                osd->upload = talloc_size(NULL, buffer_size);
+            data = osd->upload;
+            texdata = data;
+        }
     }
 
-    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, osd->buffer);
-    char *data = gl->MapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY);
-    if (!data) {
-        success = false;
+    int copy_w = 0;
+    int copy_h = 0;
+    size_t stride = 0;
+    if (imgs->packed) {
+        copy_w = imgs->packed_w;
+        copy_h = imgs->packed_h;
+        stride = imgs->packed->stride[0];
+        texdata = imgs->packed->planes[0];
+        if (pbo) {
+            memcpy_pic(data, texdata, pix_stride * copy_w,  copy_h,
+                       osd->w * pix_stride, stride);
+            stride = osd->w * pix_stride;
+            texdata = NULL;
+        }
     } else {
         struct pos bb[2];
         packer_get_bb(osd->packer, bb);
-        size_t stride = osd->w * pix_stride;
+        copy_w = bb[1].x;
+        copy_h = bb[1].y;
+        stride = osd->w * pix_stride;
         packer_copy_subbitmaps(osd->packer, imgs, data, pix_stride, stride);
-        if (!gl->UnmapBuffer(GL_PIXEL_UNPACK_BUFFER))
+    }
+
+    if (pbo) {
+        if (!gl->UnmapBuffer(GL_PIXEL_UNPACK_BUFFER)) {
             success = false;
-        glUploadTex(gl, GL_TEXTURE_2D, fmt.format, fmt.type, NULL, stride,
-                    bb[0].x, bb[0].y, bb[1].x - bb[0].x, bb[1].y - bb[0].y, 0);
+            goto done;
+        }
     }
-    gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
+    gl_upload_tex(gl, GL_TEXTURE_2D, fmt->format, fmt->type, texdata, stride,
+                  0, 0, copy_w, copy_h);
+
+    if (pbo)
+        gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+done:
     if (!success) {
         MP_FATAL(ctx, "Error: can't upload subtitles! "
                  "Remove the 'pbo' suboption.\n");
@@ -198,24 +218,13 @@ static bool upload_pbo(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
     return success;
 }
 
-static void upload_tex(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
-                       struct sub_bitmaps *imgs)
+static int next_pow2(int v)
 {
-    struct osd_fmt_entry fmt = ctx->fmt_table[imgs->format];
-    if (osd->packer->padding) {
-        struct pos bb[2];
-        packer_get_bb(osd->packer, bb);
-        glClearTex(ctx->gl, GL_TEXTURE_2D, fmt.format, fmt.type,
-                   bb[0].x, bb[0].y, bb[1].x - bb[0].y, bb[1].y - bb[0].y,
-                   0, &ctx->scratch);
-    }
-    for (int n = 0; n < osd->packer->count; n++) {
-        struct sub_bitmap *s = &imgs->parts[n];
-        struct pos p = osd->packer->result[n];
-
-        glUploadTex(ctx->gl, GL_TEXTURE_2D, fmt.format, fmt.type,
-                    s->bitmap, s->stride, p.x, p.y, s->w, s->h, 0);
+    for (int x = 0; x < 30; x++) {
+        if ((1 << x) >= v)
+            return 1 << x;
     }
+    return INT_MAX;
 }
 
 static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
@@ -223,32 +232,46 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
 {
     GL *gl = ctx->gl;
 
-    // assume 2x2 filter on scaling
-    osd->packer->padding = ctx->scaled || imgs->scaled;
-    int r = packer_pack_from_subbitmaps(osd->packer, imgs);
-    if (r < 0) {
+    int req_w = 0;
+    int req_h = 0;
+
+    if (imgs->packed) {
+        req_w = next_pow2(imgs->packed_w);
+        req_h = next_pow2(imgs->packed_h);
+    } else {
+        // assume 2x2 filter on scaling
+        osd->packer->padding = imgs->scaled;
+        int r = packer_pack_from_subbitmaps(osd->packer, imgs);
+        if (r < 0) {
+            MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
+                "supported size %dx%d.\n", osd->packer->w_max, osd->packer->h_max);
+            return false;
+        }
+        req_w = osd->packer->w;
+        req_h = osd->packer->h;
+    }
+
+    if (req_w > ctx->max_tex_wh || req_h > ctx->max_tex_wh) {
         MP_ERR(ctx, "OSD bitmaps do not fit on a surface with the maximum "
-               "supported size %dx%d.\n", osd->packer->w_max, osd->packer->h_max);
+                "supported size %dx%d.\n", ctx->max_tex_wh, ctx->max_tex_wh);
         return false;
     }
 
-    struct osd_fmt_entry fmt = ctx->fmt_table[imgs->format];
-    assert(fmt.type != 0);
+    const struct gl_format *fmt = ctx->fmt_table[imgs->format];
+    assert(fmt);
 
     if (!osd->texture)
         gl->GenTextures(1, &osd->texture);
 
     gl->BindTexture(GL_TEXTURE_2D, osd->texture);
 
-    if (osd->packer->w > osd->w || osd->packer->h > osd->h
-        || osd->format != imgs->format)
-    {
+    if (req_w > osd->w || req_h > osd->h || osd->format != imgs->format) {
         osd->format = imgs->format;
-        osd->w = FFMAX(32, osd->packer->w);
-        osd->h = FFMAX(32, osd->packer->h);
+        osd->w = FFMAX(32, req_w);
+        osd->h = FFMAX(32, req_h);
 
-        gl->TexImage2D(GL_TEXTURE_2D, 0, fmt.internal_format, osd->w, osd->h,
-                       0, fmt.format, fmt.type, NULL);
+        gl->TexImage2D(GL_TEXTURE_2D, 0, fmt->internal_format, osd->w, osd->h,
+                       0, fmt->format, fmt->type, NULL);
 
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
@@ -258,13 +281,16 @@ static bool upload_osd(struct mpgl_osd *ctx, struct mpgl_osd_part *osd,
         if (gl->DeleteBuffers)
             gl->DeleteBuffers(1, &osd->buffer);
         osd->buffer = 0;
+
+        talloc_free(osd->upload);
+        osd->upload = NULL;
     }
 
     bool uploaded = false;
     if (ctx->use_pbo)
-        uploaded = upload_pbo(ctx, osd, imgs);
+        uploaded = upload(ctx, osd, imgs, true);
     if (!uploaded)
-        upload_tex(ctx, osd, imgs);
+        upload(ctx, osd, imgs, false);
 
     gl->BindTexture(GL_TEXTURE_2D, 0);
 
@@ -280,18 +306,26 @@ static void gen_osd_cb(void *pctx, struct sub_bitmaps *imgs)
 
     struct mpgl_osd_part *osd = ctx->parts[imgs->render_index];
 
+    bool ok = true;
     if (imgs->change_id != osd->change_id) {
         if (!upload_osd(ctx, osd, imgs))
-            osd->packer->count = 0;
+            ok = false;
 
         osd->change_id = imgs->change_id;
         ctx->change_counter += 1;
     }
-    osd->num_subparts = osd->packer->count;
+    osd->num_subparts = ok ? imgs->num_parts : 0;
 
     MP_TARRAY_GROW(osd, osd->subparts, osd->num_subparts);
     memcpy(osd->subparts, imgs->parts,
            osd->num_subparts * sizeof(osd->subparts[0]));
+
+    if (!imgs->packed) {
+        for (int n = 0; n < osd->num_subparts; n++) {
+            osd->subparts[n].src_x = osd->packer->result[n].x;
+            osd->subparts[n].src_y = osd->packer->result[n].y;
+        }
+    }
 }
 
 static void write_quad(struct vertex *va, struct gl_transform t,
@@ -319,7 +353,6 @@ static int generate_verts(struct mpgl_osd_part *part, struct gl_transform t)
 
     for (int n = 0; n < part->num_subparts; n++) {
         struct sub_bitmap *b = &part->subparts[n];
-        struct pos pos = part->packer->result[n];
         struct vertex *va = part->vertices;
 
         // NOTE: the blend color is used with SUBBITMAP_LIBASS only, so it
@@ -330,7 +363,7 @@ static int generate_verts(struct mpgl_osd_part *part, struct gl_transform t)
 
         write_quad(&va[n * 6], t,
                    b->x, b->y, b->x + b->dw, b->y + b->dh,
-                   pos.x, pos.y, pos.x + b->w, pos.y + b->h,
+                   b->src_x, b->src_y, b->src_x + b->w, b->src_y + b->h,
                    part->w, part->h, color);
     }
 
diff --git a/video/out/opengl/superxbr.c b/video/out/opengl/superxbr.c
deleted file mode 100644
index 323ed18..0000000
--- a/video/out/opengl/superxbr.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "superxbr.h"
-
-#include <assert.h>
-
-#define GLSL(x) gl_sc_add(sc, #x "\n");
-#define GLSLF(...) gl_sc_addf(sc, __VA_ARGS__)
-#define GLSLH(x) gl_sc_hadd(sc, #x "\n");
-#define GLSLHF(...) gl_sc_haddf(sc, __VA_ARGS__)
-
-struct superxbr_opts {
-    float sharpness;
-    float edge_strength;
-};
-
-const struct superxbr_opts superxbr_opts_def = {
-    .sharpness = 1.0f,
-    .edge_strength = 0.6f,
-};
-
-#define OPT_BASE_STRUCT struct superxbr_opts
-const struct m_sub_options superxbr_conf = {
-    .opts = (const m_option_t[]) {
-        OPT_FLOATRANGE("sharpness", sharpness, 0, 0.0, 2.0),
-        OPT_FLOATRANGE("edge-strength", edge_strength, 0, 0.0, 1.0),
-        {0}
-    },
-    .size = sizeof(struct superxbr_opts),
-    .defaults = &superxbr_opts_def,
-};
-
-/*
-
-    *******  Super XBR Shader  *******
-
-    Copyright (c) 2015 Hyllian - sergiogdb@gmail.com
-
-    Permission is hereby granted, free of charge, to any person obtaining a copy
-    of this software and associated documentation files (the "Software"), to deal
-    in the Software without restriction, including without limitation the rights
-    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-    copies of the Software, and to permit persons to whom the Software is
-    furnished to do so, subject to the following conditions:
-
-    The above copyright notice and this permission notice shall be included in
-    all copies or substantial portions of the Software.
-
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-    THE SOFTWARE.
-
-*/
-
-struct step_params {
-    const float dstr, ostr; // sharpness strength modifiers
-    const int d1[3][3]; // 1-distance diagonal mask
-    const int d2[2][2]; // 2-distance diagonal mask
-    const int o1[3]; // 1-distance orthogonal mask
-    const int o2[3]; // 2-distance orthogonal mask
-};
-
-const struct step_params params[3] = {
-    {   .dstr = 0.129633,
-        .ostr = 0.175068,
-        .d1 = {{0, 1, 0},
-               {1, 2, 1},
-               {0, 1, 0}},
-        .d2 = {{-1,  0},
-               { 0, -1}},
-
-        .o1 = {1, 2, 1},
-        .o2 = { 0,  0},
-    }, {
-        .dstr = 0.175068,
-        .ostr = 0.129633,
-        .d1 = {{0, 1, 0},
-               {1, 4, 1},
-               {0, 1, 0}},
-        .d2 = {{ 0,  0},
-               { 0,  0}},
-
-        .o1 = {1, 4, 1},
-        .o2 = { 0,  0},
-    }
-};
-
-// Compute a single step of the superxbr process, assuming the input can be
-// sampled using i(x,y). Dumps its output into 'res'
-static void superxbr_step_h(struct gl_shader_cache *sc,
-                            const struct superxbr_opts *conf,
-                            const struct step_params *mask)
-{
-    GLSLHF("{ // step\n");
-
-    // Convolute along the diagonal and orthogonal lines
-    GLSLH(vec4 d1 = vec4( i(0,0), i(1,1), i(2,2), i(3,3) );)
-    GLSLH(vec4 d2 = vec4( i(0,3), i(1,2), i(2,1), i(3,0) );)
-    GLSLH(vec4 h1 = vec4( i(0,1), i(1,1), i(2,1), i(3,1) );)
-    GLSLH(vec4 h2 = vec4( i(0,2), i(1,2), i(2,2), i(3,2) );)
-    GLSLH(vec4 v1 = vec4( i(1,0), i(1,1), i(1,2), i(1,3) );)
-    GLSLH(vec4 v2 = vec4( i(2,0), i(2,1), i(2,2), i(2,3) );)
-
-    GLSLHF("float dw = %f;\n", conf->sharpness * mask->dstr);
-    GLSLHF("float ow = %f;\n", conf->sharpness * mask->ostr);
-    GLSLH(vec4 dk = vec4(-dw, dw+0.5, dw+0.5, -dw);) // diagonal kernel
-    GLSLH(vec4 ok = vec4(-ow, ow+0.5, ow+0.5, -ow);) // ortho kernel
-
-    // Convoluted results
-    GLSLH(float d1c = dot(d1, dk);)
-    GLSLH(float d2c = dot(d2, dk);)
-    GLSLH(float vc = dot(v1+v2, ok)/2.0;)
-    GLSLH(float hc = dot(h1+h2, ok)/2.0;)
-
-    // Compute diagonal edge strength using diagonal mask
-    GLSLH(float d_edge = 0;)
-    for (int x = 0; x < 3; x++) {
-        for (int y = 0; y < 3; y++) {
-            if (mask->d1[x][y]) {
-                // 1-distance diagonal neighbours
-                GLSLHF("d_edge += %d * abs(i(%d,%d) - i(%d,%d));\n",
-                       mask->d1[x][y], x+1, y, x, y+1);
-                GLSLHF("d_edge -= %d * abs(i(%d,%d) - i(%d,%d));\n",
-                       mask->d1[x][y], 3-y, x+1, 3-(y+1), x); // rotated
-            }
-            if (x < 2 && y < 2 && mask->d2[x][y]) {
-                // 2-distance diagonal neighbours
-                GLSLHF("d_edge += %d * abs(i(%d,%d) - i(%d,%d));\n",
-                       mask->d2[x][y], x+2, y, x, y+2);
-                GLSLHF("d_edge -= %d * abs(i(%d,%d) - i(%d,%d));\n",
-                       mask->d2[x][y], 3-y, x+2, 3-(y+2), x); // rotated
-            }
-        }
-    }
-
-    // Compute orthogonal edge strength using orthogonal mask
-    GLSLH(float o_edge = 0;)
-    for (int x = 1; x < 3; x++) {
-        for (int y = 0; y < 3; y++) {
-            if (mask->o1[y]) {
-                // 1-distance neighbours
-                GLSLHF("o_edge += %d * abs(i(%d,%d) - i(%d,%d));\n",
-                       mask->o1[y], x, y, x, y+1); // vertical
-                GLSLHF("o_edge -= %d * abs(i(%d,%d) - i(%d,%d));\n",
-                       mask->o1[y], y, x, y+1, x); // horizontal
-            }
-            if (y < 2 && mask->o2[y]) {
-                // 2-distance neighbours
-                GLSLHF("o_edge += %d * abs(i(%d,%d) - i(%d,%d));\n",
-                       mask->o2[y], x, y, x, y+2); // vertical
-                GLSLHF("o_edge -= %d * abs(i(%d,%d) - i(%d,%d));\n",
-                       mask->o2[x], y, x, y+2, x); // horizontal
-            }
-        }
-    }
-
-    // Pick the two best directions and mix them together
-    GLSLHF("float str = smoothstep(0.0, %f + 1e-6, abs(tex_mul*d_edge));\n",
-           conf->edge_strength);
-    GLSLH(res = mix(mix(d2c, d1c, step(0.0, d_edge)), \
-                    mix(hc,   vc, step(0.0, o_edge)), 1.0 - str);)
-
-    // Anti-ringing using center square
-    GLSLH(float lo = min(min( i(1,1), i(2,1) ), min( i(1,2), i(2,2) ));)
-    GLSLH(float hi = max(max( i(1,1), i(2,1) ), max( i(1,2), i(2,2) ));)
-    GLSLH(res = clamp(res, lo, hi);)
-
-    GLSLHF("} // step\n");
-}
-
-void pass_superxbr(struct gl_shader_cache *sc, int id, int step, float tex_mul,
-                   const struct superxbr_opts *conf,
-                   struct gl_transform *transform)
-{
-    if (!conf)
-        conf = &superxbr_opts_def;
-
-    assert(0 <= step && step < 2);
-    GLSLF("// superxbr (step %d)\n", step);
-    GLSLHF("#define tex texture%d\n", id);
-    GLSLHF("#define tex_size texture_size%d\n", id);
-    GLSLHF("#define tex_mul %f\n", tex_mul);
-    GLSLHF("#define pt pixel_size%d\n", id);
-
-    // We use a sub-function in the header so we can return early
-    GLSLHF("float superxbr(vec2 pos) {\n");
-    GLSLH(float i[4*4];)
-    GLSLH(float res;)
-    GLSLH(#define i(x,y) i[(x)*4+(y)])
-
-    if (step == 0) {
-        *transform = (struct gl_transform){{{2.0,0.0}, {0.0,2.0}}, {-0.5,-0.5}};
-        GLSLH(vec2 dir = fract(pos * tex_size) - 0.5;)
-
-        // Optimization: Discard (skip drawing) unused pixels, except those
-        // at the edge.
-        GLSLH(vec2 dist = tex_size * min(pos, vec2(1.0) - pos);)
-        GLSLH(if (dir.x * dir.y < 0.0 && dist.x > 1.0 && dist.y > 1.0)
-                  return 0.0;)
-
-        GLSLH(if (dir.x < 0.0 || dir.y < 0.0 || dist.x < 1.0 || dist.y < 1.0)
-                  return texture(tex, pos - pt * dir).x;)
-
-        // Load the input samples
-        GLSLH(for (int x = 0; x < 4; x++))
-        GLSLH(for (int y = 0; y < 4; y++))
-        GLSLH(i(x,y) = texture(tex, pos + pt * vec2(x-1.25, y-1.25)).x;)
-    } else {
-        *transform = (struct gl_transform){{{1.0,0.0}, {0.0,1.0}}, {0.0,0.0}};
-
-        GLSLH(vec2 dir = fract(pos * tex_size / 2.0) - 0.5;)
-        GLSLH(if (dir.x * dir.y > 0.0)
-                  return texture(tex, pos).x;)
-
-        GLSLH(for (int x = 0; x < 4; x++))
-        GLSLH(for (int y = 0; y < 4; y++))
-        GLSLH(i(x,y) = texture(tex, pos + pt * vec2(x+y-3, y-x)).x;)
-    }
-
-    superxbr_step_h(sc, conf, &params[step]);
-    GLSLH(return res;)
-    GLSLHF("}\n");
-
-    GLSLF("color.x = tex_mul * superxbr(texcoord%d);\n", id);
-}
diff --git a/video/out/opengl/superxbr.h b/video/out/opengl/superxbr.h
deleted file mode 100644
index 7aa46ef..0000000
--- a/video/out/opengl/superxbr.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef MP_GL_SUPERXBR_H
-#define MP_GL_SUPERXBR_H
-
-#include "common.h"
-#include "utils.h"
-
-extern const struct superxbr_opts superxbr_opts_def;
-extern const struct m_sub_options superxbr_conf;
-
-void pass_superxbr(struct gl_shader_cache *sc, int id, int step, float tex_mul,
-                   const struct superxbr_opts *conf,
-                   struct gl_transform *transform);
-
-#endif
diff --git a/video/out/opengl/user_shaders.c b/video/out/opengl/user_shaders.c
new file mode 100644
index 0000000..8f915a5
--- /dev/null
+++ b/video/out/opengl/user_shaders.c
@@ -0,0 +1,195 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <ctype.h>
+
+#include "user_shaders.h"
+
+// Parse the space-separated RPN expression in 'line' into 'out'. Returns
+// false on a bad token or overflow; an empty 'line' leaves 'out' untouched.
+static bool parse_rpn_szexpr(struct bstr line, struct szexp out[MAX_SZEXP_SIZE])
+{
+    int pos = 0;
+
+    while (line.len > 0) {
+        struct bstr word = bstr_strip(bstr_splitchar(line, &line, ' '));
+        if (word.len == 0)
+            continue;
+
+        if (pos >= MAX_SZEXP_SIZE)
+            return false;
+
+        struct szexp *exp = &out[pos++];
+
+        if (bstr_eatend0(&word, ".w") || bstr_eatend0(&word, ".width")) {
+            exp->tag = SZEXP_VAR_W;
+            exp->val.varname = word;
+            continue;
+        }
+
+        if (bstr_eatend0(&word, ".h") || bstr_eatend0(&word, ".height")) {
+            exp->tag = SZEXP_VAR_H;
+            exp->val.varname = word;
+            continue;
+        }
+
+        switch (word.start[0]) {
+        case '+': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_ADD; continue;
+        case '-': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_SUB; continue;
+        case '*': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_MUL; continue;
+        case '/': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_DIV; continue;
+        case '!': exp->tag = SZEXP_OP1; exp->val.op = SZEXP_OP_NOT; continue;
+        case '>': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_GT;  continue;
+        case '<': exp->tag = SZEXP_OP2; exp->val.op = SZEXP_OP_LT;  continue;
+        }
+
+        // Anything else has to be a numeric constant
+        exp->tag = SZEXP_CONST;
+        if (!isdigit(word.start[0]) ||
+            bstr_sscanf(word, "%f", &exp->val.cval) != 1)
+            return false;
+    }
+
+    if (pos > 0 && pos < MAX_SZEXP_SIZE)
+        out[pos].tag = SZEXP_END; // drop leftovers of any earlier expression
+    return true;
+}
+
+// Parses one pass from 'body' into 'out'; false if none could be parsed
+bool parse_user_shader_pass(struct mp_log *log, struct bstr *body,
+                            struct gl_user_shader *out)
+{
+    if (!(body && out && body->start && body->len > 0))
+        return false;
+
+    *out = (struct gl_user_shader){
+        .offset = identity_trans,
+        .width = {{ SZEXP_VAR_W, { .varname = bstr0("HOOKED") }}},
+        .height = {{ SZEXP_VAR_H, { .varname = bstr0("HOOKED") }}},
+        .cond = {{ SZEXP_CONST, { .cval = 1.0 }}},
+    };
+
+    int num_hooks = 0;
+    int num_binds = 0;
+
+    // Drop whatever precedes the first header line (e.g. comments)
+    int first = bstr_find(*body, bstr0("//!"));
+    if (first < 0) {
+        mp_warn(log, "Shader appears to contain no passes!\n");
+        return false;
+    }
+    *body = bstr_cut(*body, first);
+
+    // Header section: one command per "//!" line
+    for (;;) {
+        struct bstr tail;
+        struct bstr cmd = bstr_strip(bstr_getline(*body, &tail));
+
+        // A line without the magic prefix ends the header section
+        if (!bstr_eatstart0(&cmd, "//!"))
+            break;
+
+        *body = tail;
+
+        // Dispatch on the command name; its arguments remain in 'cmd'
+        if (bstr_eatstart0(&cmd, "HOOK")) {
+            if (num_hooks == SHADER_MAX_HOOKS) {
+                mp_err(log, "Passes may only hook up to %d textures!\n",
+                       SHADER_MAX_HOOKS);
+                return false;
+            }
+            out->hook_tex[num_hooks++] = bstr_strip(cmd);
+            continue;
+        }
+
+        if (bstr_eatstart0(&cmd, "BIND")) {
+            if (num_binds == SHADER_MAX_BINDS) {
+                mp_err(log, "Passes may only bind up to %d textures!\n",
+                       SHADER_MAX_BINDS);
+                return false;
+            }
+            out->bind_tex[num_binds++] = bstr_strip(cmd);
+            continue;
+        }
+
+        if (bstr_eatstart0(&cmd, "SAVE")) {
+            out->save_tex = bstr_strip(cmd);
+            continue;
+        }
+
+        if (bstr_eatstart0(&cmd, "OFFSET")) {
+            float dx, dy;
+            if (bstr_sscanf(cmd, "%f %f", &dx, &dy) != 2) {
+                mp_err(log, "Error while parsing OFFSET!\n");
+                return false;
+            }
+            out->offset.t[0] = dx;
+            out->offset.t[1] = dy;
+            continue;
+        }
+
+        if (bstr_eatstart0(&cmd, "WIDTH")) {
+            if (!parse_rpn_szexpr(cmd, out->width)) {
+                mp_err(log, "Error while parsing WIDTH!\n");
+                return false;
+            }
+            continue;
+        }
+
+        if (bstr_eatstart0(&cmd, "HEIGHT")) {
+            if (!parse_rpn_szexpr(cmd, out->height)) {
+                mp_err(log, "Error while parsing HEIGHT!\n");
+                return false;
+            }
+            continue;
+        }
+
+        if (bstr_eatstart0(&cmd, "WHEN")) {
+            if (!parse_rpn_szexpr(cmd, out->cond)) {
+                mp_err(log, "Error while parsing WHEN!\n");
+                return false;
+            }
+            continue;
+        }
+
+        if (bstr_eatstart0(&cmd, "COMPONENTS")) {
+            if (bstr_sscanf(cmd, "%d", &out->components) != 1) {
+                mp_err(log, "Error while parsing COMPONENTS!\n");
+                return false;
+            }
+            continue;
+        }
+
+        // No known command matched
+        mp_err(log, "Unrecognized command '%.*s'!\n", BSTR_P(cmd));
+        return false;
+    }
+
+    // Whatever follows, up to the next magic prefix (if any), becomes the
+    // body of this pass
+    if (bstr_split_tok(*body, "//!", &out->pass_body, body)) {
+        // Give the consumed "//!" back to the remainder
+        body->len += 3;
+        body->start -= 3;
+    }
+
+    // Not an error, but worth a warning
+    if (num_hooks == 0)
+        mp_warn(log, "Pass has no hooked textures (will be ignored)!\n");
+
+    return true;
+}
diff --git a/video/out/opengl/user_shaders.h b/video/out/opengl/user_shaders.h
new file mode 100644
index 0000000..b8c287b
--- /dev/null
+++ b/video/out/opengl/user_shaders.h
@@ -0,0 +1,74 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef MP_GL_USER_SHADERS_H
+#define MP_GL_USER_SHADERS_H
+
+#include "common.h"
+#include "utils.h"
+
+#define SHADER_API 1
+#define SHADER_MAX_HOOKS 16 // max. HOOK headers accepted per pass
+#define SHADER_MAX_BINDS 6 // max. BIND headers accepted per pass
+#define MAX_SZEXP_SIZE 32 // max. entries in one RPN expression
+
+enum szexp_op { // RPN operators, with the token the parser maps to each
+    SZEXP_OP_ADD, // '+'
+    SZEXP_OP_SUB, // '-'
+    SZEXP_OP_MUL, // '*'
+    SZEXP_OP_DIV, // '/'
+    SZEXP_OP_NOT, // '!' (parsed as the only SZEXP_OP1 operator)
+    SZEXP_OP_GT, // '>'
+    SZEXP_OP_LT, // '<'
+};
+
+enum szexp_tag {
+    SZEXP_END = 0, // End of an RPN expression (value of zero-initialized entries)
+    SZEXP_CONST, // Push a constant value onto the stack
+    SZEXP_VAR_W, // Get the width of a named texture (variable)
+    SZEXP_VAR_H, // Get the height of a named texture (variable)
+    SZEXP_OP2, // Pop two elements and push the result of a dyadic operation
+    SZEXP_OP1, // Pop one element and push the result of a monadic operation
+};
+
+struct szexp { // one entry of an RPN expression
+    enum szexp_tag tag; // selects which member of 'val' is meaningful
+    union {
+        float cval; // SZEXP_CONST: the constant
+        struct bstr varname; // SZEXP_VAR_W / SZEXP_VAR_H: texture name
+        enum szexp_op op; // SZEXP_OP1 / SZEXP_OP2: the operator
+    } val;
+};
+
+struct gl_user_shader { // one pass, as filled in by parse_user_shader_pass()
+    struct bstr hook_tex[SHADER_MAX_HOOKS]; // arguments of the HOOK headers
+    struct bstr bind_tex[SHADER_MAX_BINDS]; // arguments of the BIND headers
+    struct bstr save_tex; // argument of the SAVE header
+    struct bstr pass_body; // text from after the headers up to the next "//!"
+    struct gl_transform offset; // identity, with t[] replaced by OFFSET
+    struct szexp width[MAX_SZEXP_SIZE]; // WIDTH expression (default: HOOKED.w)
+    struct szexp height[MAX_SZEXP_SIZE]; // HEIGHT expression (default: HOOKED.h)
+    struct szexp cond[MAX_SZEXP_SIZE]; // WHEN expression (default: constant 1)
+    int components; // COMPONENTS value (0 if the header is absent)
+};
+
+// Parse the next shader pass from 'body' and advance 'body' past it. Returns
+// false if the end of the string was reached or a header failed to parse
+bool parse_user_shader_pass(struct mp_log *log, struct bstr *body,
+                            struct gl_user_shader *out);
+
+#endif
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 40f1beb..73b411e 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -24,6 +24,7 @@
 #include <assert.h>
 
 #include "common/common.h"
+#include "formats.h"
 #include "utils.h"
 
 // GLU has this as gluErrorString (we don't use GLU, as it is legacy-OpenGL)
@@ -39,7 +40,7 @@ static const char *gl_error_to_string(GLenum error)
     }
 }
 
-void glCheckError(GL *gl, struct mp_log *log, const char *info)
+void gl_check_error(GL *gl, struct mp_log *log, const char *info)
 {
     for (;;) {
         GLenum error = gl->GetError();
@@ -50,52 +51,6 @@ void glCheckError(GL *gl, struct mp_log *log, const char *info)
     }
 }
 
-// return the number of bytes per pixel for the given format
-// does not handle all possible variants, just those used by mpv
-int glFmt2bpp(GLenum format, GLenum type)
-{
-    int component_size = 0;
-    switch (type) {
-    case GL_UNSIGNED_BYTE_3_3_2:
-    case GL_UNSIGNED_BYTE_2_3_3_REV:
-        return 1;
-    case GL_UNSIGNED_SHORT_5_5_5_1:
-    case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-    case GL_UNSIGNED_SHORT_5_6_5:
-    case GL_UNSIGNED_SHORT_5_6_5_REV:
-        return 2;
-    case GL_UNSIGNED_BYTE:
-        component_size = 1;
-        break;
-    case GL_UNSIGNED_SHORT:
-        component_size = 2;
-        break;
-    }
-    switch (format) {
-    case GL_LUMINANCE:
-    case GL_ALPHA:
-        return component_size;
-    case GL_RGB_422_APPLE:
-        return 2;
-    case GL_RGB:
-    case GL_BGR:
-    case GL_RGB_INTEGER:
-        return 3 * component_size;
-    case GL_RGBA:
-    case GL_BGRA:
-    case GL_RGBA_INTEGER:
-        return 4 * component_size;
-    case GL_RED:
-    case GL_RED_INTEGER:
-        return component_size;
-    case GL_RG:
-    case GL_LUMINANCE_ALPHA:
-    case GL_RG_INTEGER:
-        return 2 * component_size;
-    }
-    abort(); // unknown
-}
-
 static int get_alignment(int stride)
 {
     if (stride % 8 == 0)
@@ -112,28 +67,26 @@ static int get_alignment(int stride)
 //  format, type: texture parameters
 //  dataptr, stride: image data
 //  x, y, width, height: part of the image to upload
-//  slice: height of an upload slice, 0 for all at once
-void glUploadTex(GL *gl, GLenum target, GLenum format, GLenum type,
-                 const void *dataptr, int stride,
-                 int x, int y, int w, int h, int slice)
+void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type,
+                   const void *dataptr, int stride,
+                   int x, int y, int w, int h)
 {
+    int bpp = gl_bytes_per_pixel(format, type);
     const uint8_t *data = dataptr;
     int y_max = y + h;
-    if (w <= 0 || h <= 0)
+    if (w <= 0 || h <= 0 || !bpp)
         return;
-    if (slice <= 0)
-        slice = h;
     if (stride < 0) {
         data += (h - 1) * stride;
         stride = -stride;
     }
     gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(stride));
-    bool use_rowlength = slice > 1 && (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH);
-    if (use_rowlength) {
+    int slice = h;
+    if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) {
         // this is not always correct, but should work for MPlayer
-        gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride / glFmt2bpp(format, type));
+        gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride / bpp);
     } else {
-        if (stride != glFmt2bpp(format, type) * w)
+        if (stride != bpp * w)
             slice = 1; // very inefficient, but at least it works
     }
     for (; y + slice <= y_max; y += slice) {
@@ -142,37 +95,12 @@ void glUploadTex(GL *gl, GLenum target, GLenum format, GLenum type,
     }
     if (y < y_max)
         gl->TexSubImage2D(target, 0, x, y, w, y_max - y, format, type, data);
-    if (use_rowlength)
+    if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH)
         gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0);
     gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4);
 }
 
-// Like glUploadTex, but upload a byte array with all elements set to val.
-// If scratch is not NULL, points to a resizeable talloc memory block than can
-// be freely used by the function (for avoiding temporary memory allocations).
-void glClearTex(GL *gl, GLenum target, GLenum format, GLenum type,
-                int x, int y, int w, int h, uint8_t val, void **scratch)
-{
-    int bpp = glFmt2bpp(format, type);
-    int stride = w * bpp;
-    int size = h * stride;
-    if (size < 1)
-        return;
-    void *data = scratch ? *scratch : NULL;
-    if (talloc_get_size(data) < size)
-        data = talloc_realloc(NULL, data, char *, size);
-    memset(data, val, size);
-    gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(stride));
-    gl->TexSubImage2D(target, 0, x, y, w, h, format, type, data);
-    gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4);
-    if (scratch) {
-        *scratch = data;
-    } else {
-        talloc_free(data);
-    }
-}
-
-mp_image_t *glGetWindowScreenshot(GL *gl)
+mp_image_t *gl_read_window_contents(GL *gl)
 {
     if (gl->es)
         return NULL; // ES can't read from front buffer
@@ -307,32 +235,6 @@ void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num)
     gl_vao_unbind(vao);
 }
 
-struct gl_format {
-    GLenum format;
-    GLenum type;
-    GLint internal_format;
-};
-
-static const struct gl_format gl_formats[] = {
-    // GLES 3.0
-    {GL_RGB,    GL_UNSIGNED_BYTE,               GL_RGB},
-    {GL_RGBA,   GL_UNSIGNED_BYTE,               GL_RGBA},
-    {GL_RGB,    GL_UNSIGNED_BYTE,               GL_RGB8},
-    {GL_RGBA,   GL_UNSIGNED_BYTE,               GL_RGBA8},
-    {GL_RGB,    GL_UNSIGNED_SHORT,              GL_RGB16},
-    {GL_RGBA,   GL_UNSIGNED_INT_2_10_10_10_REV, GL_RGB10_A2},
-    // not texture filterable in GLES 3.0
-    {GL_RGB,    GL_FLOAT,                       GL_RGB16F},
-    {GL_RGBA,   GL_FLOAT,                       GL_RGBA16F},
-    {GL_RGB,    GL_FLOAT,                       GL_RGB32F},
-    {GL_RGBA,   GL_FLOAT,                       GL_RGBA32F},
-    // Desktop GL
-    {GL_RGB,    GL_UNSIGNED_SHORT,              GL_RGB10},
-    {GL_RGBA,   GL_UNSIGNED_SHORT,              GL_RGBA12},
-    {GL_RGBA,   GL_UNSIGNED_SHORT,              GL_RGBA16},
-    {0}
-};
-
 // Create a texture and a FBO using the texture as color attachments.
 //  iformat: texture internal format
 // Returns success.
@@ -363,6 +265,7 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
     if (fbo->rw == cw && fbo->rh == ch && fbo->iformat == iformat) {
         fbo->lw = w;
         fbo->lh = h;
+        fbotex_invalidate(fbo);
         return true;
     }
 
@@ -373,19 +276,18 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
     if (flags & FBOTEX_FUZZY_H)
         h = MP_ALIGN_UP(h, 256);
 
-    GLenum filter = fbo->tex_filter;
+    mp_verbose(log, "Create FBO: %dx%d (%dx%d)\n", lw, lh, w, h);
 
-    struct gl_format format = {
-        .format = GL_RGBA,
-        .type = GL_UNSIGNED_BYTE,
-        .internal_format = iformat,
-    };
-    for (int n = 0; gl_formats[n].format; n++) {
-        if (gl_formats[n].internal_format == format.internal_format) {
-            format = gl_formats[n];
-            break;
-        }
+    const struct gl_format *format = gl_find_internal_format(gl, iformat);
+    if (!format || (format->flags & F_CF) != F_CF) {
+        mp_verbose(log, "Format 0x%x not supported.\n", (unsigned)iformat);
+        return false;
     }
+    assert(gl->mpgl_caps & MPGL_CAP_FB);
+
+    GLenum filter = fbo->tex_filter;
+
+    fbotex_uninit(fbo);
 
     *fbo = (struct fbotex) {
         .gl = gl,
@@ -396,24 +298,18 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
         .iformat = iformat,
     };
 
-    mp_verbose(log, "Create FBO: %dx%d -> %dx%d\n", fbo->lw, fbo->lh,
-                                                    fbo->rw, fbo->rh);
-
-    if (!(gl->mpgl_caps & MPGL_CAP_FB))
-        return false;
-
     gl->GenFramebuffers(1, &fbo->fbo);
     gl->GenTextures(1, &fbo->texture);
     gl->BindTexture(GL_TEXTURE_2D, fbo->texture);
-    gl->TexImage2D(GL_TEXTURE_2D, 0, format.internal_format, fbo->rw, fbo->rh, 0,
-                   format.format, format.type, NULL);
+    gl->TexImage2D(GL_TEXTURE_2D, 0, format->internal_format, fbo->rw, fbo->rh, 0,
+                   format->format, format->type, NULL);
     gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
     gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
     gl->BindTexture(GL_TEXTURE_2D, 0);
 
     fbotex_set_filter(fbo, filter ? filter : GL_LINEAR);
 
-    glCheckError(gl, log, "after creating framebuffer texture");
+    gl_check_error(gl, log, "after creating framebuffer texture");
 
     gl->BindFramebuffer(GL_FRAMEBUFFER, fbo->fbo);
     gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0,
@@ -428,7 +324,7 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
 
     gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
 
-    glCheckError(gl, log, "after creating framebuffer");
+    gl_check_error(gl, log, "after creating framebuffer");
 
     return res;
 }
@@ -457,6 +353,20 @@ void fbotex_uninit(struct fbotex *fbo)
     }
 }
 
+// Mark the contents of the FBO's color attachment as unneeded. Does nothing
+// if there is no FBO or the GL lacks InvalidateFramebuffer.
+void fbotex_invalidate(struct fbotex *fbo)
+{
+    GL *gl = fbo->gl;
+    if (fbo->fbo && gl->InvalidateFramebuffer) {
+        GLenum attachments[] = {GL_COLOR_ATTACHMENT0};
+        gl->BindFramebuffer(GL_FRAMEBUFFER, fbo->fbo);
+        gl->InvalidateFramebuffer(GL_FRAMEBUFFER, 1, attachments);
+        // Framebuffer 0 is left bound afterwards
+        gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
+    }
+}
+
 // Standard parallel 2D projection, except y1 < y0 means that the coordinate
 // system is flipped, not the projection.
 void gl_transform_ortho(struct gl_transform *t, float x0, float x1,
@@ -510,24 +420,19 @@ void gl_set_debug_logger(GL *gl, struct mp_log *log)
         gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log);
 }
 
-#define SC_ENTRIES 32
-#define SC_UNIFORM_ENTRIES 20
+// Force cache flush if more than this number of shaders is created.
+#define SC_MAX_ENTRIES 48
 
 enum uniform_type {
     UT_invalid,
     UT_i,
     UT_f,
     UT_m,
-    UT_buffer,
 };
 
 union uniform_val {
     GLfloat f[9];
     GLint i[4];
-    struct {
-        char* text;
-        GLint binding;
-    } buffer;
 };
 
 struct sc_uniform {
@@ -539,10 +444,15 @@ struct sc_uniform {
     union uniform_val v;
 };
 
+struct sc_cached_uniform { // per-shader state of one uniform
+    GLint loc; // uniform location; update_uniform() skips it if negative
+    union uniform_val v; // value last passed to GL, used to skip unchanged updates
+};
+
 struct sc_entry {
     GLuint gl_shader;
-    GLint uniform_locs[SC_UNIFORM_ENTRIES];
-    union uniform_val cached_v[SC_UNIFORM_ENTRIES];
+    struct sc_cached_uniform *uniforms;
+    int num_uniforms;
     bstr frag;
     bstr vert;
     struct gl_vao *vao;
@@ -552,18 +462,24 @@ struct gl_shader_cache {
     GL *gl;
     struct mp_log *log;
 
-    // this is modified during use (gl_sc_add() etc.)
+    // permanent
+    char **exts;
+    int num_exts;
+
+    // this is modified during use (gl_sc_add() etc.) and reset for each shader
     bstr prelude_text;
     bstr header_text;
     bstr text;
     struct gl_vao *vao;
 
-    struct sc_entry entries[SC_ENTRIES];
+    struct sc_entry *entries;
     int num_entries;
 
-    struct sc_uniform uniforms[SC_UNIFORM_ENTRIES];
+    struct sc_uniform *uniforms;
     int num_uniforms;
 
+    bool error_state; // true if an error occurred
+
     // temporary buffers (avoids frequent reallocations)
     bstr tmp[5];
 };
@@ -583,21 +499,21 @@ void gl_sc_reset(struct gl_shader_cache *sc)
     sc->prelude_text.len = 0;
     sc->header_text.len = 0;
     sc->text.len = 0;
-    for (int n = 0; n < sc->num_uniforms; n++) {
+    for (int n = 0; n < sc->num_uniforms; n++)
         talloc_free(sc->uniforms[n].name);
-        if (sc->uniforms[n].type == UT_buffer)
-            talloc_free(sc->uniforms[n].v.buffer.text);
-    }
     sc->num_uniforms = 0;
 }
 
 static void sc_flush_cache(struct gl_shader_cache *sc)
 {
+    MP_VERBOSE(sc, "flushing shader cache\n");
+
     for (int n = 0; n < sc->num_entries; n++) {
         struct sc_entry *e = &sc->entries[n];
         sc->gl->DeleteProgram(e->gl_shader);
         talloc_free(e->vert.start);
         talloc_free(e->frag.start);
+        talloc_free(e->uniforms);
     }
     sc->num_entries = 0;
 }
@@ -611,9 +527,23 @@ void gl_sc_destroy(struct gl_shader_cache *sc)
     talloc_free(sc);
 }
 
+bool gl_sc_error_state(struct gl_shader_cache *sc)
+{
+    return sc->error_state; // set when a shader failed to compile or link
+}
+
+void gl_sc_reset_error(struct gl_shader_cache *sc)
+{
+    sc->error_state = false; // stays false until the next compile/link failure
+}
+
 void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name)
 {
-    bstr_xappend_asprintf(sc, &sc->prelude_text, "#extension %s : enable\n", name);
+    for (int n = 0; n < sc->num_exts; n++) {
+        if (strcmp(sc->exts[n], name) == 0)
+            return;
+    }
+    MP_TARRAY_APPEND(sc, sc->exts, sc->num_exts, talloc_strdup(sc, name));
 }
 
 #define bstr_xappend0(sc, b, s) bstr_xappend(sc, b, bstr0(s))
@@ -644,6 +574,11 @@ void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...)
     va_end(ap);
 }
 
+void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text)
+{
+    bstr_xappend(sc, &sc->header_text, text); // appended as-is, no formatting
+}
+
 static struct sc_uniform *find_uniform(struct gl_shader_cache *sc,
                                        const char *name)
 {
@@ -652,10 +587,12 @@ static struct sc_uniform *find_uniform(struct gl_shader_cache *sc,
             return &sc->uniforms[n];
     }
     // not found -> add it
-    assert(sc->num_uniforms < SC_UNIFORM_ENTRIES); // just don't have too many
-    struct sc_uniform *new = &sc->uniforms[sc->num_uniforms++];
-    *new = (struct sc_uniform) { .loc = -1, .name = talloc_strdup(NULL, name) };
-    return new;
+    struct sc_uniform new = {
+        .loc = -1,
+        .name = talloc_strdup(NULL, name),
+    };
+    MP_TARRAY_APPEND(sc, sc->uniforms, sc->num_uniforms, new);
+    return &sc->uniforms[sc->num_uniforms - 1];
 }
 
 const char* mp_sampler_type(GLenum texture_target)
@@ -664,6 +601,7 @@ const char* mp_sampler_type(GLenum texture_target)
     case GL_TEXTURE_1D:         return "sampler1D";
     case GL_TEXTURE_2D:         return "sampler2D";
     case GL_TEXTURE_RECTANGLE:  return "sampler2DRect";
+    case GL_TEXTURE_EXTERNAL_OES: return "samplerExternalOES";
     case GL_TEXTURE_3D:         return "sampler3D";
     default: abort();
     }
@@ -765,15 +703,6 @@ void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name,
         transpose3x3(&u->v.f[0]);
 }
 
-void gl_sc_uniform_buffer(struct gl_shader_cache *sc, char *name,
-                          const char *text, int binding)
-{
-    struct sc_uniform *u = find_uniform(sc, name);
-    u->type = UT_buffer;
-    u->v.buffer.text = talloc_strdup(sc, text);
-    u->v.buffer.binding = binding;
-}
-
 // This will call glBindAttribLocation() on the shader before it's linked
 // (OpenGL requires this to happen before linking). Basically, it associates
 // the input variable names with the fields in the vao.
@@ -800,25 +729,21 @@ static const char *vao_glsl_type(const struct gl_vao_entry *e)
 // Assumes program is current (gl->UseProgram(program)).
 static void update_uniform(GL *gl, struct sc_entry *e, struct sc_uniform *u, int n)
 {
-    if (u->type == UT_buffer) {
-        GLuint idx = gl->GetUniformBlockIndex(e->gl_shader, u->name);
-        gl->UniformBlockBinding(e->gl_shader, idx, u->v.buffer.binding);
-        return;
-    }
-    GLint loc = e->uniform_locs[n];
+    struct sc_cached_uniform *un = &e->uniforms[n];
+    GLint loc = un->loc;
     if (loc < 0)
         return;
     switch (u->type) {
     case UT_i:
         assert(u->size == 1);
-        if (memcmp(e->cached_v[n].i, u->v.i, sizeof(u->v.i)) != 0) {
-            memcpy(e->cached_v[n].i, u->v.i, sizeof(u->v.i));
+        if (memcmp(un->v.i, u->v.i, sizeof(u->v.i)) != 0) {
+            memcpy(un->v.i, u->v.i, sizeof(u->v.i));
             gl->Uniform1i(loc, u->v.i[0]);
         }
         break;
     case UT_f:
-        if (memcmp(e->cached_v[n].f, u->v.f, sizeof(u->v.f)) != 0) {
-            memcpy(e->cached_v[n].f, u->v.f, sizeof(u->v.f));
+        if (memcmp(un->v.f, u->v.f, sizeof(u->v.f)) != 0) {
+            memcpy(un->v.f, u->v.f, sizeof(u->v.f));
             switch (u->size) {
             case 1: gl->Uniform1f(loc, u->v.f[0]); break;
             case 2: gl->Uniform2f(loc, u->v.f[0], u->v.f[1]); break;
@@ -830,8 +755,8 @@ static void update_uniform(GL *gl, struct sc_entry *e, struct sc_uniform *u, int
         }
         break;
     case UT_m:
-        if (memcmp(e->cached_v[n].f, u->v.f, sizeof(u->v.f)) != 0) {
-            memcpy(e->cached_v[n].f, u->v.f, sizeof(u->v.f));
+        if (memcmp(un->v.f, u->v.f, sizeof(u->v.f)) != 0) {
+            memcpy(un->v.f, u->v.f, sizeof(u->v.f));
             switch (u->size) {
             case 2: gl->UniformMatrix2fv(loc, 1, GL_FALSE, &u->v.f[0]); break;
             case 3: gl->UniformMatrix3fv(loc, 1, GL_FALSE, &u->v.f[0]); break;
@@ -870,9 +795,22 @@ static void compile_attach_shader(struct gl_shader_cache *sc, GLuint program,
                typestr, status, logstr);
         talloc_free(logstr);
     }
+    if (gl->GetTranslatedShaderSourceANGLE && mp_msg_test(sc->log, MSGL_DEBUG)) {
+        GLint len = 0;
+        gl->GetShaderiv(shader, GL_TRANSLATED_SHADER_SOURCE_LENGTH_ANGLE, &len);
+        if (len > 0) {
+            GLchar *sstr = talloc_zero_size(NULL, len + 1);
+            gl->GetTranslatedShaderSourceANGLE(shader, len, NULL, sstr);
+            MP_DBG(sc, "Translated shader:\n");
+            mp_log_source(sc->log, MSGL_DEBUG, sstr);
+        }
+    }
 
     gl->AttachShader(program, shader);
     gl->DeleteShader(shader);
+
+    if (!status)
+        sc->error_state = true;
 }
 
 static void link_shader(struct gl_shader_cache *sc, GLuint program)
@@ -891,6 +829,9 @@ static void link_shader(struct gl_shader_cache *sc, GLuint program)
         MP_MSG(sc, pri, "shader link log (status=%d): %s\n", status, logstr);
         talloc_free(logstr);
     }
+
+    if (!status)
+        sc->error_state = true;
 }
 
 static GLuint create_program(struct gl_shader_cache *sc, const char *vertex,
@@ -939,8 +880,14 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc)
     // set up shader text (header + uniforms + body)
     bstr *header = &sc->tmp[0];
     ADD(header, "#version %d%s\n", gl->glsl_version, gl->es >= 300 ? " es" : "");
-    if (gl->es)
+    for (int n = 0; n < sc->num_exts; n++)
+        ADD(header, "#extension %s : enable\n", sc->exts[n]);
+    if (gl->es) {
         ADD(header, "precision mediump float;\n");
+        ADD(header, "precision mediump sampler2D;\n");
+        if (gl->mpgl_caps & MPGL_CAP_3D_TEX)
+            ADD(header, "precision mediump sampler3D;\n");
+    }
     ADD_BSTR(header, sc->prelude_text);
     char *vert_in = gl->glsl_version >= 130 ? "in" : "attribute";
     char *vert_out = gl->glsl_version >= 130 ? "out" : "varying";
@@ -975,7 +922,6 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc)
     // fragment shader; still requires adding used uniforms and VAO elements
     bstr *frag = &sc->tmp[4];
     ADD_BSTR(frag, *header);
-    ADD(frag, "#define RG %s\n", gl->mpgl_caps & MPGL_CAP_TEX_RG ? "rg" : "ra");
     if (gl->glsl_version >= 130) {
         ADD(frag, "#define texture1D texture\n");
         ADD(frag, "#define texture3D texture\n");
@@ -986,11 +932,7 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc)
     ADD_BSTR(frag, *frag_vaos);
     for (int n = 0; n < sc->num_uniforms; n++) {
         struct sc_uniform *u = &sc->uniforms[n];
-        if (u->type == UT_buffer) {
-            ADD(frag, "uniform %s { %s };\n", u->name, u->v.buffer.text);
-        } else {
-            ADD(frag, "uniform %s %s;\n", u->glsl_type, u->name);
-        }
+        ADD(frag, "uniform %s %s;\n", u->glsl_type, u->name);
     }
 
     // Additional helpers.
@@ -1023,8 +965,9 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc)
         }
     }
     if (!entry) {
-        if (sc->num_entries == SC_ENTRIES)
+        if (sc->num_entries == SC_MAX_ENTRIES)
             sc_flush_cache(sc);
+        MP_TARRAY_GROW(sc, sc->entries, sc->num_entries);
         entry = &sc->entries[sc->num_entries++];
         *entry = (struct sc_entry){
             .vert = bstrdup(NULL, *vert),
@@ -1035,15 +978,146 @@ void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc)
     if (!entry->gl_shader) {
         entry->gl_shader = create_program(sc, vert->start, frag->start);
         for (int n = 0; n < sc->num_uniforms; n++) {
-            entry->uniform_locs[n] = gl->GetUniformLocation(entry->gl_shader,
-                                                            sc->uniforms[n].name);
+            struct sc_cached_uniform un = {
+                .loc = gl->GetUniformLocation(entry->gl_shader,
+                                              sc->uniforms[n].name),
+            };
+            MP_TARRAY_APPEND(sc, entry->uniforms, entry->num_uniforms, un);
         }
     }
 
     gl->UseProgram(entry->gl_shader);
 
+    assert(sc->num_uniforms == entry->num_uniforms);
+
     for (int n = 0; n < sc->num_uniforms; n++)
         update_uniform(gl, entry, &sc->uniforms[n], n);
 
     gl_sc_reset(sc);
 }
+
+// Maximum number of simultaneous query objects to keep around. Reducing this
+// number might cause rendering to block until the result of a previous query is
+// available
+#define QUERY_OBJECT_NUM 8
+
+// How many samples to keep around, for the sake of average and peak
+// calculations. This corresponds to a few seconds (exact time variable)
+#define QUERY_SAMPLE_SIZE 256
+
+struct gl_timer {
+    GL *gl;
+    GLuint query[QUERY_OBJECT_NUM];
+    int query_idx;
+
+    GLuint64 samples[QUERY_SAMPLE_SIZE];
+    int sample_idx;
+    int sample_count;
+
+    uint64_t avg_sum;
+    uint64_t peak;
+};
+
+int gl_timer_sample_count(struct gl_timer *timer)
+{
+    return timer->sample_count;
+}
+
+uint64_t gl_timer_last_us(struct gl_timer *timer)
+{   // Newest sample in us; bias by the ring size, since (0 - 1) % N is -1 in C
+    return timer->samples[(timer->sample_idx + QUERY_SAMPLE_SIZE - 1) % QUERY_SAMPLE_SIZE] / 1000;
+}
+
+uint64_t gl_timer_avg_us(struct gl_timer *timer)
+{
+    if (timer->sample_count <= 0)
+        return 0;
+
+    return timer->avg_sum / timer->sample_count / 1000;
+}
+
+uint64_t gl_timer_peak_us(struct gl_timer *timer)
+{
+    return timer->peak / 1000;
+}
+
+struct gl_timer *gl_timer_create(GL *gl)
+{
+    struct gl_timer *timer = talloc_ptrtype(NULL, timer);
+    *timer = (struct gl_timer){ .gl = gl };
+
+    if (gl->GenQueries)
+        gl->GenQueries(QUERY_OBJECT_NUM, timer->query);
+
+    return timer;
+}
+
+void gl_timer_free(struct gl_timer *timer)
+{
+    if (!timer)
+        return;
+
+    GL *gl = timer->gl;
+    if (gl && gl->DeleteQueries) {
+        // this is a no-op on already uninitialized queries
+        gl->DeleteQueries(QUERY_OBJECT_NUM, timer->query);
+    }
+
+    talloc_free(timer);
+}
+
+static void gl_timer_record(struct gl_timer *timer, GLuint64 new)
+{
+    // Store the new sample in the ring buffer and grab the value it replaces
+    GLuint64 old = timer->samples[timer->sample_idx];
+    timer->samples[timer->sample_idx++] = new;
+    timer->sample_idx %= QUERY_SAMPLE_SIZE;
+
+    // Update average and sum
+    timer->avg_sum = timer->avg_sum + new - old;
+    timer->sample_count = MPMIN(timer->sample_count + 1, QUERY_SAMPLE_SIZE);
+
+    // Update peak if necessary
+    if (new >= timer->peak) {
+        timer->peak = new;
+    } else if (timer->peak == old) {
+        // It's possible that the last peak was the value we just removed,
+        // if so we need to scan for the new peak
+        uint64_t peak = new;
+        for (int i = 0; i < QUERY_SAMPLE_SIZE; i++)
+            peak = MPMAX(peak, timer->samples[i]);
+        timer->peak = peak;
+    }
+}
+
+// If no free query is available, this can block. Shouldn't ever happen in
+// practice, though. (If it does, consider increasing QUERY_OBJECT_NUM)
+// IMPORTANT: only one gl_timer object may ever be active at a single time.
+// The calling code *MUST* ensure this
+void gl_timer_start(struct gl_timer *timer)
+{
+    GL *gl = timer->gl;
+    if (!gl->BeginQuery)
+        return;
+
+    // Get the next query object
+    GLuint id = timer->query[timer->query_idx++];
+    timer->query_idx %= QUERY_OBJECT_NUM;
+
+    // If this query object already holds a result, we need to get and
+    // record it first
+    if (gl->IsQuery(id)) {
+        GLuint64 elapsed;
+        gl->GetQueryObjectui64v(id, GL_QUERY_RESULT, &elapsed);
+        gl_timer_record(timer, elapsed);
+    }
+
+    gl->BeginQuery(GL_TIME_ELAPSED, id);
+}
+
+void gl_timer_stop(struct gl_timer *timer)
+{
+    GL *gl = timer->gl;
+    if (gl->EndQuery)
+        gl->EndQuery(GL_TIME_ELAPSED);
+}
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 170e24d..9b4fd84 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -20,19 +20,17 @@
 #define MP_GL_UTILS_
 
 #include "common.h"
+#include "math.h"
 
 struct mp_log;
 
-void glCheckError(GL *gl, struct mp_log *log, const char *info);
+void gl_check_error(GL *gl, struct mp_log *log, const char *info);
 
-int glFmt2bpp(GLenum format, GLenum type);
-void glUploadTex(GL *gl, GLenum target, GLenum format, GLenum type,
-                 const void *dataptr, int stride,
-                 int x, int y, int w, int h, int slice);
-void glClearTex(GL *gl, GLenum target, GLenum format, GLenum type,
-                int x, int y, int w, int h, uint8_t val, void **scratch);
+void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type,
+                   const void *dataptr, int stride,
+                   int x, int y, int w, int h);
 
-mp_image_t *glGetWindowScreenshot(GL *gl);
+mp_image_t *gl_read_window_contents(GL *gl);
 
 const char* mp_sampler_type(GLenum texture_target);
 
@@ -84,6 +82,7 @@ bool fbotex_change(struct fbotex *fbo, GL *gl, struct mp_log *log, int w, int h,
 #define FBOTEX_FUZZY_H 2
 #define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H)
 void fbotex_set_filter(struct fbotex *fbo, GLenum gl_filter);
+void fbotex_invalidate(struct fbotex *fbo);
 
 // A 3x2 matrix, with the translation part separate.
 struct gl_transform {
@@ -115,6 +114,13 @@ struct mp_rect_f {
     float x0, y0, x1, y1;
 };
 
+// Semantic equality (fuzzy comparison)
+static inline bool mp_rect_f_seq(struct mp_rect_f a, struct mp_rect_f b)
+{
+    return fabs(a.x0 - b.x0) < 1e-6 && fabs(a.x1 - b.x1) < 1e-6 &&
+           fabs(a.y0 - b.y0) < 1e-6 && fabs(a.y1 - b.y1) < 1e-6;
+}
+
 static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r)
 {
     gl_transform_vec(t, &r->x0, &r->y0);
@@ -141,10 +147,13 @@ struct gl_shader_cache;
 
 struct gl_shader_cache *gl_sc_create(GL *gl, struct mp_log *log);
 void gl_sc_destroy(struct gl_shader_cache *sc);
+bool gl_sc_error_state(struct gl_shader_cache *sc);
+void gl_sc_reset_error(struct gl_shader_cache *sc);
 void gl_sc_add(struct gl_shader_cache *sc, const char *text);
 void gl_sc_addf(struct gl_shader_cache *sc, const char *textf, ...);
 void gl_sc_hadd(struct gl_shader_cache *sc, const char *text);
 void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...);
+void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text);
 void gl_sc_uniform_sampler(struct gl_shader_cache *sc, char *name, GLenum target,
                            int unit);
 void gl_sc_uniform_sampler_ui(struct gl_shader_cache *sc, char *name, int unit);
@@ -156,11 +165,21 @@ void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name,
                         bool transpose, GLfloat *v);
 void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name,
                         bool transpose, GLfloat *v);
-void gl_sc_uniform_buffer(struct gl_shader_cache *sc, char *name,
-                          const char *text, int binding);
 void gl_sc_set_vao(struct gl_shader_cache *sc, struct gl_vao *vao);
 void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name);
 void gl_sc_gen_shader_and_reset(struct gl_shader_cache *sc);
 void gl_sc_reset(struct gl_shader_cache *sc);
 
+struct gl_timer;
+
+struct gl_timer *gl_timer_create(GL *gl);
+void gl_timer_free(struct gl_timer *timer);
+void gl_timer_start(struct gl_timer *timer);
+void gl_timer_stop(struct gl_timer *timer);
+
+int gl_timer_sample_count(struct gl_timer *timer);
+uint64_t gl_timer_last_us(struct gl_timer *timer);
+uint64_t gl_timer_avg_us(struct gl_timer *timer);
+uint64_t gl_timer_peak_us(struct gl_timer *timer);
+
 #endif
diff --git a/video/out/opengl/video.c b/video/out/opengl/video.c
index 8807b65..f46fdc1 100644
--- a/video/out/opengl/video.c
+++ b/video/out/opengl/video.c
@@ -17,6 +17,7 @@
 
 #include <assert.h>
 #include <math.h>
+#include <stdarg.h>
 #include <stdbool.h>
 #include <string.h>
 #include <assert.h>
@@ -31,24 +32,22 @@
 #include "common/global.h"
 #include "options/options.h"
 #include "common.h"
+#include "formats.h"
 #include "utils.h"
 #include "hwdec.h"
 #include "osd.h"
 #include "stream/stream.h"
-#include "superxbr.h"
-#include "nnedi3.h"
 #include "video_shaders.h"
+#include "user_shaders.h"
 #include "video/out/filter_kernels.h"
 #include "video/out/aspect.h"
 #include "video/out/bitmap_packer.h"
 #include "video/out/dither.h"
 #include "video/out/vo.h"
 
-// Maximal number of passes that prescaler can be applied.
-#define MAX_PRESCALE_PASSES 5
-
-// Maximal number of steps each pass of prescaling contains
-#define MAX_PRESCALE_STEPS 2
+// Maximal number of saved textures (for user script purposes)
+#define MAX_TEXTURE_HOOKS 16
+#define MAX_SAVED_TEXTURES 32
 
 // scale/cscale arguments that map directly to shader filter routines.
 // Note that the convolution filters are not included in this list.
@@ -91,6 +90,7 @@ static const struct gl_vao_entry vertex_vao[] = {
 
 struct texplane {
     int w, h;
+    int tex_w, tex_h;
     GLint gl_internal_format;
     GLenum gl_target;
     bool use_integer;
@@ -98,12 +98,14 @@ struct texplane {
     GLenum gl_type;
     GLuint gl_texture;
     int gl_buffer;
+    char swizzle[5];
 };
 
 struct video_image {
     struct texplane planes[4];
     bool image_flipped;
     struct mp_image *mpi;       // original input image
+    bool hwdec_mapped;
 };
 
 enum plane_type {
@@ -125,10 +127,29 @@ struct img_tex {
     GLenum gl_target;
     bool use_integer;
     int tex_w, tex_h; // source texture size
-    int w, h; // logical size (with pre_transform applied)
-    struct gl_transform pre_transform; // source texture space
+    int w, h; // logical size (after transformation)
     struct gl_transform transform; // rendering transformation
-    bool texture_la; // it's a GL_LUMINANCE_ALPHA texture (access with .ra not .rg)
+    char swizzle[5];
+};
+
+// A named img_tex, for user scripting purposes
+struct saved_tex {
+    const char *name;
+    struct img_tex tex;
+};
+
+// A texture hook. This is some operation that transforms a named texture as
+// soon as it's generated
+struct tex_hook {
+    char *hook_tex;
+    char *save_tex;
+    char *bind_tex[TEXUNIT_VIDEO_NUM];
+    int components; // how many components are relevant (0 = same as input)
+    void *priv; // this can be set to whatever the hook wants
+    void (*hook)(struct gl_video *p, struct img_tex tex, // generates GLSL
+                 struct gl_transform *trans, void *priv);
+    void (*free)(struct tex_hook *hook);
+    bool (*cond)(struct gl_video *p, struct img_tex tex, void *priv);
 };
 
 struct fbosurface {
@@ -140,7 +161,7 @@ struct fbosurface {
 
 struct cached_file {
     char *path;
-    char *body;
+    struct bstr body;
 };
 
 struct gl_video {
@@ -149,15 +170,15 @@ struct gl_video {
     struct mpv_global *global;
     struct mp_log *log;
     struct gl_video_opts opts;
+    struct gl_video_opts *opts_alloc;
     struct gl_lcms *cms;
     bool gl_debug;
 
     int texture_16bit_depth;    // actual bits available in 16 bit textures
+    int fb_depth;               // actual bits available in GL main framebuffer
 
     struct gl_shader_cache *sc;
 
-    GLenum gl_target; // texture target (GL_TEXTURE_2D, ...) for video and FBOs
-
     struct gl_vao vao;
 
     struct osd_state *osd_state;
@@ -170,7 +191,9 @@ struct gl_video {
     GLuint dither_texture;
     int dither_size;
 
-    GLuint nnedi3_weights_buffer;
+    struct gl_timer *upload_timer;
+    struct gl_timer *render_timer;
+    struct gl_timer *present_timer;
 
     struct mp_image_params real_image_params;   // configured format
     struct mp_image_params image_params;        // texture format (mind hwdec case)
@@ -188,21 +211,13 @@ struct gl_video {
     bool forced_dumb_mode;
 
     struct fbotex merge_fbo[4];
-    struct fbotex deband_fbo[4];
     struct fbotex scale_fbo[4];
     struct fbotex integer_fbo[4];
     struct fbotex indirect_fbo;
     struct fbotex blend_subs_fbo;
-    struct fbotex unsharp_fbo;
     struct fbotex output_fbo;
     struct fbosurface surfaces[FBOSURFACES_MAX];
-
-    // these are duplicated so we can keep rendering back and forth between
-    // them to support an unlimited number of shader passes per step
-    struct fbotex pre_fbo[2];
-    struct fbotex post_fbo[2];
-
-    struct fbotex prescale_fbo[MAX_PRESCALE_PASSES][MAX_PRESCALE_STEPS];
+    struct fbotex vdpau_deinterleave_fbo[2];
 
     int surface_idx;
     int surface_now;
@@ -229,6 +244,14 @@ struct gl_video {
     bool use_linear;
     float user_gamma;
 
+    // hooks and saved textures
+    struct saved_tex saved_tex[MAX_SAVED_TEXTURES];
+    int saved_tex_num;
+    struct tex_hook tex_hooks[MAX_TEXTURE_HOOKS];
+    int tex_hook_num;
+    struct fbotex hook_fbos[MAX_SAVED_TEXTURES];
+    int hook_fbo_num;
+
     int frames_uploaded;
     int frames_rendered;
     AVLFG lfg;
@@ -237,7 +260,7 @@ struct gl_video {
     int last_dither_matrix_size;
     float *last_dither_matrix;
 
-    struct cached_file files[10];
+    struct cached_file *files;
     int num_files;
 
     struct gl_hwdec *hwdec;
@@ -245,89 +268,7 @@ struct gl_video {
 
     bool dsi_warned;
     bool custom_shader_fn_warned;
-};
-
-struct fmt_entry {
-    int mp_format;
-    GLint internal_format;
-    GLenum format;
-    GLenum type;
-};
-
-// Very special formats, for which OpenGL happens to have direct support
-static const struct fmt_entry mp_to_gl_formats[] = {
-    {IMGFMT_RGB565,  GL_RGB,   GL_RGB,  GL_UNSIGNED_SHORT_5_6_5},
-    {0},
-};
-
-static const struct fmt_entry gl_byte_formats[] = {
-    {0, GL_RED,     GL_RED,     GL_UNSIGNED_BYTE},      // 1 x 8
-    {0, GL_RG,      GL_RG,      GL_UNSIGNED_BYTE},      // 2 x 8
-    {0, GL_RGB,     GL_RGB,     GL_UNSIGNED_BYTE},      // 3 x 8
-    {0, GL_RGBA,    GL_RGBA,    GL_UNSIGNED_BYTE},      // 4 x 8
-    {0, GL_R16,     GL_RED,     GL_UNSIGNED_SHORT},     // 1 x 16
-    {0, GL_RG16,    GL_RG,      GL_UNSIGNED_SHORT},     // 2 x 16
-    {0, GL_RGB16,   GL_RGB,     GL_UNSIGNED_SHORT},     // 3 x 16
-    {0, GL_RGBA16,  GL_RGBA,    GL_UNSIGNED_SHORT},     // 4 x 16
-};
-
-static const struct fmt_entry gl_byte_formats_gles3[] = {
-    {0, GL_R8,       GL_RED,    GL_UNSIGNED_BYTE},      // 1 x 8
-    {0, GL_RG8,      GL_RG,     GL_UNSIGNED_BYTE},      // 2 x 8
-    {0, GL_RGB8,     GL_RGB,    GL_UNSIGNED_BYTE},      // 3 x 8
-    {0, GL_RGBA8,    GL_RGBA,   GL_UNSIGNED_BYTE},      // 4 x 8
-    // There are no filterable texture formats that can be uploaded as
-    // GL_UNSIGNED_SHORT, so apparently we're out of luck.
-    {0, 0,           0,         0},                     // 1 x 16
-    {0, 0,           0,         0},                     // 2 x 16
-    {0, 0,           0,         0},                     // 3 x 16
-    {0, 0,           0,         0},                     // 4 x 16
-};
-
-static const struct fmt_entry gl_ui_byte_formats_gles3[] = {
-    {0, GL_R8UI,      GL_RED_INTEGER,   GL_UNSIGNED_BYTE},  // 1 x 8
-    {0, GL_RG8UI,     GL_RG_INTEGER,    GL_UNSIGNED_BYTE},  // 2 x 8
-    {0, GL_RGB8UI,    GL_RGB_INTEGER,   GL_UNSIGNED_BYTE},  // 3 x 8
-    {0, GL_RGBA8UI,   GL_RGBA_INTEGER,  GL_UNSIGNED_BYTE},  // 4 x 8
-    {0, GL_R16UI,     GL_RED_INTEGER,   GL_UNSIGNED_SHORT}, // 1 x 16
-    {0, GL_RG16UI,    GL_RG_INTEGER,    GL_UNSIGNED_SHORT}, // 2 x 16
-    {0, GL_RGB16UI,   GL_RGB_INTEGER,   GL_UNSIGNED_SHORT}, // 3 x 16
-    {0, GL_RGBA16UI,  GL_RGBA_INTEGER,  GL_UNSIGNED_SHORT}, // 4 x 16
-};
-
-static const struct fmt_entry gl_byte_formats_gles2[] = {
-    {0, GL_LUMINANCE,           GL_LUMINANCE,       GL_UNSIGNED_BYTE}, // 1 x 8
-    {0, GL_LUMINANCE_ALPHA,     GL_LUMINANCE_ALPHA, GL_UNSIGNED_BYTE}, // 2 x 8
-    {0, GL_RGB,                 GL_RGB,             GL_UNSIGNED_BYTE}, // 3 x 8
-    {0, GL_RGBA,                GL_RGBA,            GL_UNSIGNED_BYTE}, // 4 x 8
-    {0, 0,                      0,                  0},                // 1 x 16
-    {0, 0,                      0,                  0},                // 2 x 16
-    {0, 0,                      0,                  0},                // 3 x 16
-    {0, 0,                      0,                  0},                // 4 x 16
-};
-
-static const struct fmt_entry gl_byte_formats_legacy[] = {
-    {0, GL_LUMINANCE,           GL_LUMINANCE,       GL_UNSIGNED_BYTE}, // 1 x 8
-    {0, GL_LUMINANCE_ALPHA,     GL_LUMINANCE_ALPHA, GL_UNSIGNED_BYTE}, // 2 x 8
-    {0, GL_RGB,                 GL_RGB,             GL_UNSIGNED_BYTE}, // 3 x 8
-    {0, GL_RGBA,                GL_RGBA,            GL_UNSIGNED_BYTE}, // 4 x 8
-    {0, GL_LUMINANCE16,         GL_LUMINANCE,       GL_UNSIGNED_SHORT},// 1 x 16
-    {0, GL_LUMINANCE16_ALPHA16, GL_LUMINANCE_ALPHA, GL_UNSIGNED_SHORT},// 2 x 16
-    {0, GL_RGB16,               GL_RGB,             GL_UNSIGNED_SHORT},// 3 x 16
-    {0, GL_RGBA16,              GL_RGBA,            GL_UNSIGNED_SHORT},// 4 x 16
-};
-
-static const struct fmt_entry gl_float16_formats[] = {
-    {0, GL_R16F,    GL_RED,     GL_FLOAT},              // 1 x f
-    {0, GL_RG16F,   GL_RG,      GL_FLOAT},              // 2 x f
-    {0, GL_RGB16F,  GL_RGB,     GL_FLOAT},              // 3 x f
-    {0, GL_RGBA16F, GL_RGBA,    GL_FLOAT},              // 4 x f
-};
-
-static const struct fmt_entry gl_apple_formats[] = {
-    {IMGFMT_UYVY, GL_RGB, GL_RGB_422_APPLE, GL_UNSIGNED_SHORT_8_8_APPLE},
-    {IMGFMT_YUYV, GL_RGB, GL_RGB_422_APPLE, GL_UNSIGNED_SHORT_8_8_REV_APPLE},
-    {0}
+    bool broken_frame; // temporary error state
 };
 
 struct packed_fmt_entry {
@@ -359,6 +300,7 @@ static const struct packed_fmt_entry mp_packed_formats[] = {
 };
 
 const struct gl_video_opts gl_video_opts_def = {
+    .dither_algo = DITHER_FRUIT,
     .dither_depth = -1,
     .dither_size = 6,
     .temporal_dither_period = 1,
@@ -375,14 +317,16 @@ const struct gl_video_opts gl_video_opts_def = {
     .scaler_resizes_only = 1,
     .scaler_lut_size = 6,
     .interpolation_threshold = 0.0001,
-    .alpha_mode = 3,
+    .alpha_mode = ALPHA_BLEND_TILES,
     .background = {0, 0, 0, 255},
     .gamma = 1.0f,
-    .prescale_passes = 1,
-    .prescale_downscaling_threshold = 2.0f,
+    .target_brightness = 250,
+    .hdr_tone_mapping = TONE_MAPPING_HABLE,
+    .tone_mapping_param = NAN,
 };
 
 const struct gl_video_opts gl_video_opts_hq_def = {
+    .dither_algo = DITHER_FRUIT,
     .dither_depth = 0,
     .dither_size = 6,
     .temporal_dither_period = 1,
@@ -401,13 +345,13 @@ const struct gl_video_opts gl_video_opts_hq_def = {
     .scaler_resizes_only = 1,
     .scaler_lut_size = 6,
     .interpolation_threshold = 0.0001,
-    .alpha_mode = 3,
+    .alpha_mode = ALPHA_BLEND_TILES,
     .background = {0, 0, 0, 255},
     .gamma = 1.0f,
-    .blend_subs = 0,
     .deband = 1,
-    .prescale_passes = 1,
-    .prescale_downscaling_threshold = 2.0f,
+    .target_brightness = 250,
+    .hdr_tone_mapping = TONE_MAPPING_HABLE,
+    .tone_mapping_param = NAN,
 };
 
 static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt,
@@ -436,6 +380,14 @@ const struct m_sub_options gl_video_conf = {
         OPT_FLAG("gamma-auto", gamma_auto, 0),
         OPT_CHOICE_C("target-prim", target_prim, 0, mp_csp_prim_names),
         OPT_CHOICE_C("target-trc", target_trc, 0, mp_csp_trc_names),
+        OPT_INTRANGE("target-brightness", target_brightness, 0, 1, 100000),
+        OPT_CHOICE("hdr-tone-mapping", hdr_tone_mapping, 0,
+                   ({"clip",     TONE_MAPPING_CLIP},
+                    {"reinhard", TONE_MAPPING_REINHARD},
+                    {"hable",    TONE_MAPPING_HABLE},
+                    {"gamma",    TONE_MAPPING_GAMMA},
+                    {"linear",   TONE_MAPPING_LINEAR})),
+        OPT_FLOAT("tone-mapping-param", tone_mapping_param, 0),
         OPT_FLAG("pbo", pbo, 0),
         SCALER_OPTS("scale",  SCALER_SCALE),
         SCALER_OPTS("dscale", SCALER_DSCALE),
@@ -449,9 +401,7 @@ const struct m_sub_options gl_video_conf = {
         OPT_FLOATRANGE("sigmoid-center", sigmoid_center, 0, 0.0, 1.0),
         OPT_FLOATRANGE("sigmoid-slope", sigmoid_slope, 0, 1.0, 20.0),
         OPT_CHOICE("fbo-format", fbo_format, 0,
-                   ({"rgb",    GL_RGB},
-                    {"rgba",   GL_RGBA},
-                    {"rgb8",   GL_RGB8},
+                   ({"rgb8",   GL_RGB8},
                     {"rgba8",  GL_RGBA8},
                     {"rgb10",  GL_RGB10},
                     {"rgb10_a2", GL_RGB10_A2},
@@ -466,42 +416,33 @@ const struct m_sub_options gl_video_conf = {
         OPT_CHOICE_OR_INT("dither-depth", dither_depth, 0, -1, 16,
                           ({"no", -1}, {"auto", 0})),
         OPT_CHOICE("dither", dither_algo, 0,
-                   ({"fruit", 0}, {"ordered", 1}, {"no", -1})),
+                   ({"fruit", DITHER_FRUIT},
+                    {"ordered", DITHER_ORDERED},
+                    {"no", DITHER_NONE})),
         OPT_INTRANGE("dither-size-fruit", dither_size, 0, 2, 8),
         OPT_FLAG("temporal-dither", temporal_dither, 0),
         OPT_INTRANGE("temporal-dither-period", temporal_dither_period, 0, 1, 128),
         OPT_CHOICE("alpha", alpha_mode, 0,
-                   ({"no", 0},
-                    {"yes", 1},
-                    {"blend", 2},
-                    {"blend-tiles", 3})),
+                   ({"no", ALPHA_NO},
+                    {"yes", ALPHA_YES},
+                    {"blend", ALPHA_BLEND},
+                    {"blend-tiles", ALPHA_BLEND_TILES})),
         OPT_FLAG("rectangle-textures", use_rectangle, 0),
         OPT_COLOR("background", background, 0),
         OPT_FLAG("interpolation", interpolation, 0),
         OPT_FLOAT("interpolation-threshold", interpolation_threshold, 0),
         OPT_CHOICE("blend-subtitles", blend_subs, 0,
-                   ({"no", 0},
-                    {"yes", 1},
-                    {"video", 2})),
+                   ({"no", BLEND_SUBS_NO},
+                    {"yes", BLEND_SUBS_YES},
+                    {"video", BLEND_SUBS_VIDEO})),
         OPT_STRING("scale-shader", scale_shader, 0),
         OPT_STRINGLIST("pre-shaders", pre_shaders, 0),
         OPT_STRINGLIST("post-shaders", post_shaders, 0),
+        OPT_STRINGLIST("user-shaders", user_shaders, 0),
         OPT_FLAG("deband", deband, 0),
         OPT_SUBSTRUCT("deband", deband_opts, deband_conf, 0),
         OPT_FLOAT("sharpen", unsharp, 0),
-        OPT_CHOICE("prescale-luma", prescale_luma, 0,
-                   ({"none", 0},
-                    {"superxbr", 1}
-#if HAVE_NNEDI
-                    , {"nnedi3", 2}
-#endif
-                    )),
-        OPT_INTRANGE("prescale-passes",
-                     prescale_passes, 0, 1, MAX_PRESCALE_PASSES),
-        OPT_FLOATRANGE("prescale-downscaling-threshold",
-                       prescale_downscaling_threshold, 0, 0.0, 32.0),
-        OPT_SUBSTRUCT("superxbr", superxbr_opts, superxbr_conf, 0),
-        OPT_SUBSTRUCT("nnedi3", nnedi3_opts, nnedi3_conf, 0),
+        OPT_SUBSTRUCT("", icc_opts, mp_icc_conf, 0),
 
         OPT_REMOVED("approx-gamma", "this is always enabled now"),
         OPT_REMOVED("cscale-down", "chroma is never downscaled"),
@@ -509,6 +450,7 @@ const struct m_sub_options gl_video_conf = {
         OPT_REMOVED("indirect", "this is set automatically whenever sane"),
         OPT_REMOVED("srgb", "use target-prim=bt709:target-trc=srgb instead"),
         OPT_REMOVED("source-shader", "use :deband to enable debanding"),
+        OPT_REMOVED("prescale-luma", "use user shaders for prescaling"),
 
         OPT_REPLACED("lscale", "scale"),
         OPT_REPLACED("lscale-down", "scale-down"),
@@ -524,7 +466,6 @@ const struct m_sub_options gl_video_conf = {
         OPT_REPLACED("smoothmotion-threshold", "tscale-param1"),
         OPT_REPLACED("scale-down", "dscale"),
         OPT_REPLACED("fancy-downscaling", "correct-downscaling"),
-        OPT_REPLACED("prescale", "prescale-luma"),
 
         {0}
     },
@@ -535,78 +476,44 @@ const struct m_sub_options gl_video_conf = {
 static void uninit_rendering(struct gl_video *p);
 static void uninit_scaler(struct gl_video *p, struct scaler *scaler);
 static void check_gl_features(struct gl_video *p);
-static bool init_format(int fmt, struct gl_video *init);
-static void gl_video_upload_image(struct gl_video *p, struct mp_image *mpi);
-static void assign_options(struct gl_video_opts *dst, struct gl_video_opts *src);
+static bool init_format(struct gl_video *p, int fmt, bool test_only);
+static void init_image_desc(struct gl_video *p, int fmt);
+static bool gl_video_upload_image(struct gl_video *p, struct mp_image *mpi);
+static void set_options(struct gl_video *p, struct gl_video_opts *src);
+static const char *handle_scaler_opt(const char *name, bool tscale);
+static void reinit_from_options(struct gl_video *p);
 static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]);
+static void gl_video_setup_hooks(struct gl_video *p);
 
 #define GLSL(x) gl_sc_add(p->sc, #x "\n");
 #define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__)
 #define GLSLHF(...) gl_sc_haddf(p->sc, __VA_ARGS__)
 
-// Return a fixed point texture format with given characteristics.
-static const struct fmt_entry *find_tex_format(GL *gl, int bytes_per_comp,
-                                               int n_channels)
-{
-    assert(bytes_per_comp == 1 || bytes_per_comp == 2);
-    assert(n_channels >= 1 && n_channels <= 4);
-    const struct fmt_entry *fmts = gl_byte_formats;
-    if (gl->es >= 300) {
-        fmts = gl_byte_formats_gles3;
-    } else if (gl->es) {
-        fmts = gl_byte_formats_gles2;
-    } else if (!(gl->mpgl_caps & MPGL_CAP_TEX_RG)) {
-        fmts = gl_byte_formats_legacy;
-    }
-    return &fmts[n_channels - 1 + (bytes_per_comp - 1) * 4];
-}
-
-static bool is_integer_format(const struct fmt_entry *fmt)
-{
-    // Tests only the formats which we actually declare somewhere.
-    switch (fmt->format) {
-    case GL_RED_INTEGER:
-    case GL_RG_INTEGER:
-    case GL_RGB_INTEGER:
-    case GL_RGBA_INTEGER:
-        return true;
-    }
-    return false;
-}
-
-static const char *load_cached_file(struct gl_video *p, const char *path)
+static struct bstr load_cached_file(struct gl_video *p, const char *path)
 {
     if (!path || !path[0])
-        return NULL;
+        return (struct bstr){0};
     for (int n = 0; n < p->num_files; n++) {
         if (strcmp(p->files[n].path, path) == 0)
             return p->files[n].body;
     }
     // not found -> load it
-    if (p->num_files == MP_ARRAY_SIZE(p->files)) {
-        // empty cache when it overflows
-        for (int n = 0; n < p->num_files; n++) {
-            talloc_free(p->files[n].path);
-            talloc_free(p->files[n].body);
-        }
-        p->num_files = 0;
-    }
-    struct bstr s = stream_read_file(path, p, p->global, 100000); // 100 kB
+    struct bstr s = stream_read_file(path, p, p->global, 1024000); // 1024 kB
     if (s.len) {
-        struct cached_file *new = &p->files[p->num_files++];
-        *new = (struct cached_file) {
+        struct cached_file new = {
             .path = talloc_strdup(p, path),
-            .body = s.start
+            .body = s,
         };
-        return new->body;
+        MP_TARRAY_APPEND(p, p->files, p->num_files, new);
+        return new.body;
     }
-    return NULL;
+    return (struct bstr){0};
 }
 
 static void debug_check_gl(struct gl_video *p, const char *msg)
 {
     if (p->gl_debug)
-        glCheckError(p->gl, p->log, msg);
+        gl_check_error(p->gl, p->log, msg);
 }
 
 void gl_video_set_debug(struct gl_video *p, bool enable)
@@ -628,13 +535,23 @@ static void gl_video_reset_surfaces(struct gl_video *p)
     p->output_fbo_valid = false;
 }
 
+static void gl_video_reset_hooks(struct gl_video *p)
+{
+    for (int i = 0; i < p->tex_hook_num; i++) {
+        if (p->tex_hooks[i].free)
+            p->tex_hooks[i].free(&p->tex_hooks[i]);
+    }
+
+    p->tex_hook_num = 0;
+}
+
 static inline int fbosurface_wrap(int id)
 {
     id = id % FBOSURFACES_MAX;
     return id < 0 ? id + FBOSURFACES_MAX : id;
 }
 
-static void recreate_osd(struct gl_video *p)
+static void reinit_osd(struct gl_video *p)
 {
     mpgl_osd_destroy(p->osd);
     p->osd = NULL;
@@ -644,17 +561,6 @@ static void recreate_osd(struct gl_video *p)
     }
 }
 
-static void reinit_rendering(struct gl_video *p)
-{
-    MP_VERBOSE(p, "Reinit rendering.\n");
-
-    debug_check_gl(p, "before scaler initialization");
-
-    uninit_rendering(p);
-
-    recreate_osd(p);
-}
-
 static void uninit_rendering(struct gl_video *p)
 {
     GL *gl = p->gl;
@@ -665,45 +571,41 @@ static void uninit_rendering(struct gl_video *p)
     gl->DeleteTextures(1, &p->dither_texture);
     p->dither_texture = 0;
 
-    gl->DeleteBuffers(1, &p->nnedi3_weights_buffer);
-    p->nnedi3_weights_buffer = 0;
-
     for (int n = 0; n < 4; n++) {
         fbotex_uninit(&p->merge_fbo[n]);
-        fbotex_uninit(&p->deband_fbo[n]);
         fbotex_uninit(&p->scale_fbo[n]);
         fbotex_uninit(&p->integer_fbo[n]);
     }
 
     fbotex_uninit(&p->indirect_fbo);
     fbotex_uninit(&p->blend_subs_fbo);
-    fbotex_uninit(&p->unsharp_fbo);
-
-    for (int n = 0; n < 2; n++) {
-        fbotex_uninit(&p->pre_fbo[n]);
-        fbotex_uninit(&p->post_fbo[n]);
-    }
-
-    for (int pass = 0; pass < MAX_PRESCALE_PASSES; pass++) {
-        for (int step = 0; step < MAX_PRESCALE_STEPS; step++)
-            fbotex_uninit(&p->prescale_fbo[pass][step]);
-    }
 
     for (int n = 0; n < FBOSURFACES_MAX; n++)
         fbotex_uninit(&p->surfaces[n].fbotex);
 
+    for (int n = 0; n < MAX_SAVED_TEXTURES; n++)
+        fbotex_uninit(&p->hook_fbos[n]);
+
+    for (int n = 0; n < 2; n++)
+        fbotex_uninit(&p->vdpau_deinterleave_fbo[n]);
+
     gl_video_reset_surfaces(p);
+    gl_video_reset_hooks(p);
+
+    gl_sc_reset_error(p->sc);
 }
 
-void gl_video_update_profile(struct gl_video *p)
+// Warning: profile.start must point to a ta allocation, and the function
+//          takes over ownership.
+void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data)
 {
-    if (p->use_lut_3d)
-        return;
-
-    p->use_lut_3d = true;
-    check_gl_features(p);
+    if (gl_lcms_set_memory_profile(p->cms, icc_data))
+        reinit_from_options(p);
+}
 
-    reinit_rendering(p);
+bool gl_video_icc_auto_enabled(struct gl_video *p)
+{
+    return p->opts.icc_opts ? p->opts.icc_opts->profile_auto : false;
 }
 
 static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim,
@@ -711,14 +613,15 @@ static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim,
 {
     GL *gl = p->gl;
 
-    if (!p->cms || !p->use_lut_3d)
+    if (!p->use_lut_3d)
         return false;
 
-    if (!gl_lcms_has_changed(p->cms, prim, trc))
+    if (p->lut_3d_texture && !gl_lcms_has_changed(p->cms, prim, trc))
         return true;
 
     struct lut3d *lut3d = NULL;
     if (!gl_lcms_get_lut3d(p->cms, &lut3d, prim, trc) || !lut3d) {
+        p->use_lut_3d = false;
         return false;
     }
 
@@ -738,12 +641,14 @@ static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim,
 
     debug_check_gl(p, "after 3d lut creation");
 
+    talloc_free(lut3d);
+
     return true;
 }
 
 // Fill an img_tex struct from an FBO + some metadata
-static struct img_tex img_tex_fbo(struct fbotex *fbo, struct gl_transform t,
-                                  enum plane_type type, int components)
+static struct img_tex img_tex_fbo(struct fbotex *fbo, enum plane_type type,
+                                  int components)
 {
     assert(type != PLANE_NONE);
     return (struct img_tex){
@@ -756,8 +661,7 @@ static struct img_tex img_tex_fbo(struct fbotex *fbo, struct gl_transform t,
         .tex_h = fbo->rh,
         .w = fbo->lw,
         .h = fbo->lh,
-        .pre_transform = identity_trans,
-        .transform = t,
+        .transform = identity_trans,
         .components = components,
     };
 }
@@ -797,18 +701,19 @@ static void get_plane_source_transform(struct gl_video *p, int w, int h,
 }
 
 // Places a video_image's image textures + associated metadata into tex[]. The
-// number of textures is equal to p->plane_count.
+// number of textures is equal to p->plane_count. Any necessary plane offsets
+// are stored in off. (e.g. chroma position)
 static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
-                             struct img_tex tex[4])
+                             struct img_tex tex[4], struct gl_transform off[4])
 {
     assert(vimg->mpi);
 
     // Determine the chroma offset
-    struct gl_transform chroma = (struct gl_transform){{{0}}};
-
     float ls_w = 1.0 / (1 << p->image_desc.chroma_xs);
     float ls_h = 1.0 / (1 << p->image_desc.chroma_ys);
 
+    struct gl_transform chroma = {{{ls_w, 0.0}, {0.0, ls_h}}};
+
     if (p->image_params.chroma_location != MP_CHROMA_CENTER) {
         int cx, cy;
         mp_get_chroma_location(p->image_params.chroma_location, &cx, &cy);
@@ -821,11 +726,7 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
         chroma.t[1] = ls_h < 1 ? ls_h * -cy / 2 : 0;
     }
 
-    // Make sure luma/chroma sizes are aligned.
-    // Example: For 4:2:0 with size 3x3, the subsampled chroma plane is 2x2
-    // so luma (3,3) has to align with chroma (2,2).
-    chroma.m[0][0] = ls_w * (float)vimg->planes[0].w / vimg->planes[1].w;
-    chroma.m[1][1] = ls_h * (float)vimg->planes[0].h / vimg->planes[1].h;
+    // FIXME: account for rotation in the chroma offset
 
     // The existing code assumes we just have a single tex multiplier for
     // all of the planes. This may change in the future
@@ -856,17 +757,18 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg,
             .gl_target = t->gl_target,
             .multiplier = tex_mul,
             .use_integer = t->use_integer,
-            .tex_w = t->w,
-            .tex_h = t->h,
+            .tex_w = t->tex_w,
+            .tex_h = t->tex_h,
             .w = t->w,
             .h = t->h,
-            .transform = type == PLANE_CHROMA ? chroma : identity_trans,
             .components = p->image_desc.components[n],
-            .texture_la = t->gl_format == GL_LUMINANCE_ALPHA,
         };
-        get_plane_source_transform(p, t->w, t->h, &tex[n].pre_transform);
+        snprintf(tex[n].swizzle, sizeof(tex[n].swizzle), "%s", t->swizzle);
+        get_plane_source_transform(p, t->w, t->h, &tex[n].transform);
         if (p->image_params.rotate % 180 == 90)
             MPSWAP(int, tex[n].w, tex[n].h);
+
+        off[n] = type == PLANE_CHROMA ? chroma : identity_trans;
     }
 }
 
@@ -874,19 +776,21 @@ static void init_video(struct gl_video *p)
 {
     GL *gl = p->gl;
 
-    init_format(p->image_params.imgfmt, p);
-    p->gl_target = p->opts.use_rectangle ? GL_TEXTURE_RECTANGLE : GL_TEXTURE_2D;
-
-    check_gl_features(p);
-
-    if (p->hwdec_active) {
+    if (p->hwdec && p->hwdec->driver->imgfmt == p->image_params.imgfmt) {
         if (p->hwdec->driver->reinit(p->hwdec, &p->image_params) < 0)
             MP_ERR(p, "Initializing texture for hardware decoding failed.\n");
-        init_format(p->image_params.imgfmt, p);
-        p->image_params.imgfmt = p->image_desc.id;
-        p->gl_target = p->hwdec->gl_texture_target;
+        init_image_desc(p, p->image_params.imgfmt);
+        const char **exts = p->hwdec->glsl_extensions;
+        for (int n = 0; exts && exts[n]; n++)
+            gl_sc_enable_extension(p->sc, (char *)exts[n]);
+        p->hwdec_active = true;
+    } else {
+        init_format(p, p->image_params.imgfmt, false);
     }
 
+    // Format-dependent checks.
+    check_gl_features(p);
+
     mp_image_params_guess_csp(&p->image_params);
 
     int eq_caps = MP_CSP_EQ_CAPS_GAMMA;
@@ -900,42 +804,65 @@ static void init_video(struct gl_video *p)
 
     debug_check_gl(p, "before video texture creation");
 
-    struct video_image *vimg = &p->image;
+    if (!p->hwdec_active) {
+        struct video_image *vimg = &p->image;
 
-    struct mp_image layout = {0};
-    mp_image_set_params(&layout, &p->image_params);
+        GLenum gl_target =
+            p->opts.use_rectangle ? GL_TEXTURE_RECTANGLE : GL_TEXTURE_2D;
 
-    for (int n = 0; n < p->plane_count; n++) {
-        struct texplane *plane = &vimg->planes[n];
+        struct mp_image layout = {0};
+        mp_image_set_params(&layout, &p->image_params);
 
-        plane->gl_target = p->gl_target;
+        for (int n = 0; n < p->plane_count; n++) {
+            struct texplane *plane = &vimg->planes[n];
 
-        plane->w = mp_image_plane_w(&layout, n);
-        plane->h = mp_image_plane_h(&layout, n);
+            plane->gl_target = gl_target;
+
+            plane->w = plane->tex_w = mp_image_plane_w(&layout, n);
+            plane->h = plane->tex_h = mp_image_plane_h(&layout, n);
 
-        if (!p->hwdec_active) {
             gl->ActiveTexture(GL_TEXTURE0 + n);
             gl->GenTextures(1, &plane->gl_texture);
-            gl->BindTexture(p->gl_target, plane->gl_texture);
+            gl->BindTexture(gl_target, plane->gl_texture);
 
-            gl->TexImage2D(p->gl_target, 0, plane->gl_internal_format,
+            gl->TexImage2D(gl_target, 0, plane->gl_internal_format,
                            plane->w, plane->h, 0,
                            plane->gl_format, plane->gl_type, NULL);
 
             int filter = plane->use_integer ? GL_NEAREST : GL_LINEAR;
-            gl->TexParameteri(p->gl_target, GL_TEXTURE_MIN_FILTER, filter);
-            gl->TexParameteri(p->gl_target, GL_TEXTURE_MAG_FILTER, filter);
-            gl->TexParameteri(p->gl_target, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
-            gl->TexParameteri(p->gl_target, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
-        }
+            gl->TexParameteri(gl_target, GL_TEXTURE_MIN_FILTER, filter);
+            gl->TexParameteri(gl_target, GL_TEXTURE_MAG_FILTER, filter);
+            gl->TexParameteri(gl_target, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+            gl->TexParameteri(gl_target, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 
-        MP_VERBOSE(p, "Texture for plane %d: %dx%d\n", n, plane->w, plane->h);
+            MP_VERBOSE(p, "Texture for plane %d: %dx%d\n", n, plane->w, plane->h);
+        }
+        gl->ActiveTexture(GL_TEXTURE0);
     }
-    gl->ActiveTexture(GL_TEXTURE0);
 
     debug_check_gl(p, "after video texture creation");
 
-    reinit_rendering(p);
+    gl_video_setup_hooks(p);
+}
+
+// Release any texture mappings associated with the current frame.
+static void unmap_current_image(struct gl_video *p)
+{
+    struct video_image *vimg = &p->image;
+
+    if (vimg->hwdec_mapped) {
+        assert(p->hwdec_active);
+        if (p->hwdec->driver->unmap)
+            p->hwdec->driver->unmap(p->hwdec);
+        memset(vimg->planes, 0, sizeof(vimg->planes));
+        vimg->hwdec_mapped = false;
+    }
+}
+
+static void unref_current_image(struct gl_video *p)
+{
+    unmap_current_image(p);
+    mp_image_unrefp(&p->image.mpi);
 }
 
 static void uninit_video(struct gl_video *p)
@@ -946,21 +873,21 @@ static void uninit_video(struct gl_video *p)
 
     struct video_image *vimg = &p->image;
 
+    unref_current_image(p);
+
     for (int n = 0; n < p->plane_count; n++) {
         struct texplane *plane = &vimg->planes[n];
 
-        if (!p->hwdec_active)
-            gl->DeleteTextures(1, &plane->gl_texture);
-        plane->gl_texture = 0;
+        gl->DeleteTextures(1, &plane->gl_texture);
         gl->DeleteBuffers(1, &plane->gl_buffer);
-        plane->gl_buffer = 0;
     }
-    mp_image_unrefp(&vimg->mpi);
+    *vimg = (struct video_image){0};
 
     // Invalidate image_params to ensure that gl_video_config() will call
     // init_video() on uninitialized gl_video.
     p->real_image_params = (struct mp_image_params){0};
     p->image_params = p->real_image_params;
+    p->hwdec_active = false;
 }
 
 static void pass_prepare_src_tex(struct gl_video *p)
@@ -975,9 +902,11 @@ static void pass_prepare_src_tex(struct gl_video *p)
 
         char texture_name[32];
         char texture_size[32];
+        char texture_rot[32];
         char pixel_size[32];
         snprintf(texture_name, sizeof(texture_name), "texture%d", n);
         snprintf(texture_size, sizeof(texture_size), "texture_size%d", n);
+        snprintf(texture_rot, sizeof(texture_rot), "texture_rot%d", n);
         snprintf(pixel_size, sizeof(pixel_size), "pixel_size%d", n);
 
         if (s->use_integer) {
@@ -991,6 +920,7 @@ static void pass_prepare_src_tex(struct gl_video *p)
             f[1] = s->tex_h;
         }
         gl_sc_uniform_vec2(sc, texture_size, f);
+        gl_sc_uniform_mat2(sc, texture_rot, true, (float *)s->transform.m);
         gl_sc_uniform_vec2(sc, pixel_size, (GLfloat[]){1.0f / f[0],
                                                        1.0f / f[1]});
 
@@ -1022,7 +952,6 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
             if (!s->gl_tex)
                 continue;
             struct gl_transform tr = s->transform;
-            gl_transform_trans(s->pre_transform, &tr);
             float tx = (n / 2) * s->w;
             float ty = (n % 2) * s->h;
             gl_transform_vec(tr, &tx, &ty);
@@ -1038,7 +967,6 @@ static void render_pass_quad(struct gl_video *p, int vp_w, int vp_h,
     debug_check_gl(p, "after rendering");
 }
 
-// flags: see render_pass_quad
 static void finish_pass_direct(struct gl_video *p, GLint fbo, int vp_w, int vp_h,
                                const struct mp_rect *dst)
 {
@@ -1067,6 +995,34 @@ static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo,
                        &(struct mp_rect){0, 0, w, h});
 }
 
+// Copy a texture to the vec4 color, while increasing offset. Also applies
+// the texture multiplier to the sampled color
+static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img)
+{
+    int count = img.components;
+    assert(*offset + count <= 4);
+
+    int id = pass_bind(p, img);
+    char src[5] = {0};
+    char dst[5] = {0};
+    const char *tex_fmt = img.swizzle[0] ? img.swizzle : "rgba";
+    const char *dst_fmt = "rgba";
+    for (int i = 0; i < count; i++) {
+        src[i] = tex_fmt[i];
+        dst[i] = dst_fmt[*offset + i];
+    }
+
+    if (img.use_integer) {
+        uint64_t tex_max = 1ull << p->image_desc.component_full_bits;
+        img.multiplier *= 1.0 / (tex_max - 1);
+    }
+
+    GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n",
+          dst, img.multiplier, id, id, src);
+
+    *offset += count;
+}
+
 static void skip_unused(struct gl_video *p, int num_components)
 {
     for (int i = num_components; i < 4; i++)
@@ -1083,9 +1039,202 @@ static void uninit_scaler(struct gl_video *p, struct scaler *scaler)
     scaler->initialized = false;
 }
 
-static void load_shader(struct gl_video *p, const char *body)
+static void hook_prelude(struct gl_video *p, const char *name, int id,
+                         struct img_tex tex)
 {
-    gl_sc_hadd(p->sc, body);
+    GLSLHF("#define %s_raw texture%d\n", name, id);
+    GLSLHF("#define %s_pos texcoord%d\n", name, id);
+    GLSLHF("#define %s_size texture_size%d\n", name, id);
+    GLSLHF("#define %s_rot texture_rot%d\n", name, id);
+    GLSLHF("#define %s_pt pixel_size%d\n", name, id);
+
+    // Set up the sampling functions
+    GLSLHF("#define %s_tex(pos) (%f * vec4(texture(%s_raw, pos)).%s)\n",
+           name, tex.multiplier, name, tex.swizzle[0] ? tex.swizzle : "rgba");
+
+    // Since the extra matrix multiplication impacts performance,
+    // skip it unless the texture was actually rotated
+    if (gl_transform_eq(tex.transform, identity_trans)) {
+        GLSLHF("#define %s_texOff(off) %s_tex(%s_pos + %s_pt * vec2(off))\n",
+               name, name, name, name);
+    } else {
+        GLSLHF("#define %s_texOff(off) "
+                   "%s_tex(%s_pos + %s_rot * vec2(off)/%s_size)\n",
+               name, name, name, name, name);
+    }
+}
+
+static bool saved_tex_find(struct gl_video *p, const char *name,
+                           struct img_tex *out)
+{
+    if (!name || !out)
+        return false;
+
+    for (int i = 0; i < p->saved_tex_num; i++) {
+        if (strcmp(p->saved_tex[i].name, name) == 0) {
+            *out = p->saved_tex[i].tex;
+            return true;
+        }
+    }
+
+    return false;
+}
+
+static void saved_tex_store(struct gl_video *p, const char *name,
+                            struct img_tex tex)
+{
+    assert(name);
+
+    for (int i = 0; i < p->saved_tex_num; i++) {
+        if (strcmp(p->saved_tex[i].name, name) == 0) {
+            p->saved_tex[i].tex = tex;
+            return;
+        }
+    }
+
+    assert(p->saved_tex_num < MAX_SAVED_TEXTURES);
+    p->saved_tex[p->saved_tex_num++] = (struct saved_tex) {
+        .name = name,
+        .tex = tex
+    };
+}
+
+// Process hooks for a plane, saving the result and returning a new img_tex
+// If 'trans' is NULL, the shader is forbidden from transforming tex
+static struct img_tex pass_hook(struct gl_video *p, const char *name,
+                                struct img_tex tex, struct gl_transform *trans)
+{
+    if (!name)
+        return tex;
+
+    saved_tex_store(p, name, tex);
+
+    MP_DBG(p, "Running hooks for %s\n", name);
+    for (int i = 0; i < p->tex_hook_num; i++) {
+        struct tex_hook *hook = &p->tex_hooks[i];
+
+        if (strcmp(hook->hook_tex, name) != 0)
+            continue;
+
+        // Check the hook's condition
+        if (hook->cond && !hook->cond(p, tex, hook->priv)) {
+            MP_DBG(p, "Skipping hook on %s due to condition.\n", name);
+            continue;
+        }
+
+        // Bind all necessary textures and add them to the prelude
+        for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) {
+            const char *bind_name = hook->bind_tex[t];
+            struct img_tex bind_tex;
+
+            if (!bind_name)
+                continue;
+
+            // This is a special name that means "currently hooked texture"
+            if (strcmp(bind_name, "HOOKED") == 0) {
+                int id = pass_bind(p, tex);
+                hook_prelude(p, "HOOKED", id, tex);
+                hook_prelude(p, name, id, tex);
+                continue;
+            }
+
+            if (!saved_tex_find(p, bind_name, &bind_tex)) {
+                // Clean up texture bindings and move on to the next hook
+                MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n",
+                       name, bind_name);
+                p->pass_tex_num -= t;
+                goto next_hook;
+            }
+
+            hook_prelude(p, bind_name, pass_bind(p, bind_tex), bind_tex);
+        }
+
+        // Run the actual hook. This generates a series of GLSL shader
+        // instructions sufficient for drawing the hook's output
+        struct gl_transform hook_off = identity_trans;
+        hook->hook(p, tex, &hook_off, hook->priv);
+
+        int comps = hook->components ? hook->components : tex.components;
+        skip_unused(p, comps);
+
+        // Compute the updated FBO dimensions and store the result
+        struct mp_rect_f sz = {0, 0, tex.w, tex.h};
+        gl_transform_rect(hook_off, &sz);
+        int w = lroundf(fabs(sz.x1 - sz.x0));
+        int h = lroundf(fabs(sz.y1 - sz.y0));
+
+        assert(p->hook_fbo_num < MAX_SAVED_TEXTURES);
+        struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++];
+        finish_pass_fbo(p, fbo, w, h, 0);
+
+        const char *store_name = hook->save_tex ? hook->save_tex : name;
+        struct img_tex saved_tex = img_tex_fbo(fbo, tex.type, comps);
+
+        // If the texture we're saving overwrites the "current" texture, also
+        // update the tex parameter so that the future loop cycles will use the
+        // updated values, and export the offset
+        if (strcmp(store_name, name) == 0) {
+            if (!trans && !gl_transform_eq(hook_off, identity_trans)) {
+                MP_ERR(p, "Hook tried changing size of unscalable texture %s!\n",
+                       name);
+                return tex;
+            }
+
+            tex = saved_tex;
+            if (trans)
+                gl_transform_trans(hook_off, trans);
+        }
+
+        saved_tex_store(p, store_name, saved_tex);
+
+next_hook: ;
+    }
+
+    return tex;
+}
+
+// This can be used at any time in the middle of rendering to specify an
+// optional hook point, which if triggered will render out to a new FBO and
+// load the result back into vec4 color. Offsets applied by the hooks are
+// accumulated in tex_trans, and the FBO is dimensioned according
+// to p->texture_w/h
+static void pass_opt_hook_point(struct gl_video *p, const char *name,
+                                struct gl_transform *tex_trans)
+{
+    if (!name)
+        return;
+
+    for (int i = 0; i < p->tex_hook_num; i++) {
+        struct tex_hook *hook = &p->tex_hooks[i];
+
+        if (strcmp(hook->hook_tex, name) == 0)
+            goto found;
+
+        for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) {
+            if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0)
+                goto found;
+        }
+    }
+
+    // Nothing uses this texture, don't bother storing it
+    return;
+
+found:
+    assert(p->hook_fbo_num < MAX_SAVED_TEXTURES);
+    struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++];
+    finish_pass_fbo(p, fbo, p->texture_w, p->texture_h, 0);
+
+    struct img_tex img = img_tex_fbo(fbo, PLANE_RGB, p->components);
+    img = pass_hook(p, name, img, tex_trans);
+    copy_img_tex(p, &(int){0}, img);
+    p->texture_w = img.w;
+    p->texture_h = img.h;
+    p->components = img.components;
+}
+
+static void load_shader(struct gl_video *p, struct bstr body)
+{
+    gl_sc_hadd_bstr(p->sc, body);
     gl_sc_uniform_f(p->sc, "random", (double)av_lfg_get(&p->lfg) / UINT32_MAX);
     gl_sc_uniform_f(p->sc, "frame", p->frames_uploaded);
     gl_sc_uniform_vec2(p->sc, "image_size", (GLfloat[]){p->image_params.w,
@@ -1105,34 +1254,6 @@ static const char *get_custom_shader_fn(struct gl_video *p, const char *body)
     return "sample_pixel";
 }
 
-// Applies an arbitrary number of shaders in sequence, using the given pair
-// of FBOs as intermediate buffers. Returns whether any shaders were applied.
-static bool apply_shaders(struct gl_video *p, char **shaders, int w, int h,
-                          struct fbotex textures[2])
-{
-    if (!shaders)
-        return false;
-    bool success = false;
-    int tex = 0;
-    for (int n = 0; shaders[n]; n++) {
-        const char *body = load_cached_file(p, shaders[n]);
-        if (!body)
-            continue;
-        finish_pass_fbo(p, &textures[tex], w, h, 0);
-        int id = pass_bind(p, img_tex_fbo(&textures[tex], identity_trans,
-                                          PLANE_RGB, p->components));
-        GLSLHF("#define pixel_size pixel_size%d\n", id);
-        load_shader(p, body);
-        const char *fn_name = get_custom_shader_fn(p, body);
-        GLSLF("// custom shader\n");
-        GLSLF("color = %s(texture%d, texcoord%d, texture_size%d);\n",
-              fn_name, id, id, id);
-        tex = (tex+1) % 2;
-        success = true;
-    }
-    return success;
-}
-
 // Semantic equality
 static bool double_seq(double a, double b)
 {
@@ -1175,6 +1296,9 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler,
     uninit_scaler(p, scaler);
 
     scaler->conf = *conf;
+    bool is_tscale = scaler->index == SCALER_TSCALE;
+    scaler->conf.kernel.name = (char *)handle_scaler_opt(conf->kernel.name, is_tscale);
+    scaler->conf.window.name = (char *)handle_scaler_opt(conf->window.name, is_tscale);
     scaler->scale_factor = scale_factor;
     scaler->insufficient = false;
     scaler->initialized = true;
@@ -1229,7 +1353,7 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler,
     }
     int width = size / elems_per_pixel;
     assert(size == width * elems_per_pixel);
-    const struct fmt_entry *fmt = &gl_float16_formats[elems_per_pixel - 1];
+    const struct gl_format *fmt = gl_find_float16_format(gl, elems_per_pixel);
     GLenum target = scaler->gl_target;
 
     gl->ActiveTexture(GL_TEXTURE0 + TEXUNIT_SCALERS + scaler->index);
@@ -1288,7 +1412,8 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src,
     finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H);
 
     // Second pass (scale only in the x dir)
-    src = img_tex_fbo(&scaler->sep_fbo, t_x, src.type, src.components);
+    src = img_tex_fbo(&scaler->sep_fbo, src.type, src.components);
+    src.transform = t_x;
     sampler_prelude(p->sc, pass_bind(p, src));
     GLSLF("// pass 2\n");
     pass_sample_separated_gen(p->sc, scaler, 1, 0);
@@ -1322,10 +1447,10 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
     } else if (strcmp(name, "oversample") == 0) {
         pass_sample_oversample(p->sc, scaler, w, h);
     } else if (strcmp(name, "custom") == 0) {
-        const char *body = load_cached_file(p, p->opts.scale_shader);
-        if (body) {
+        struct bstr body = load_cached_file(p, p->opts.scale_shader);
+        if (body.start) {
             load_shader(p, body);
-            const char *fn_name = get_custom_shader_fn(p, body);
+            const char *fn_name = get_custom_shader_fn(p, body.start);
             GLSLF("// custom scale-shader\n");
             GLSLF("color = %s(tex, pos, size);\n", fn_name);
         } else {
@@ -1349,316 +1474,474 @@ static void pass_sample(struct gl_video *p, struct img_tex tex,
     skip_unused(p, tex.components);
 }
 
-// Get the number of passes for prescaler, with given display size.
-static int get_prescale_passes(struct gl_video *p, struct img_tex tex[4])
+// Returns true if two img_texs are semantically equivalent (same metadata)
+static bool img_tex_equiv(struct img_tex a, struct img_tex b)
 {
-    if (!p->opts.prescale_luma)
-        return 0;
+    return a.type == b.type &&
+           a.components == b.components &&
+           a.multiplier == b.multiplier &&
+           a.gl_target == b.gl_target &&
+           a.use_integer == b.use_integer &&
+           a.tex_w == b.tex_w &&
+           a.tex_h == b.tex_h &&
+           a.w == b.w &&
+           a.h == b.h &&
+           gl_transform_eq(a.transform, b.transform) &&
+           strcmp(a.swizzle, b.swizzle) == 0;
+}
 
-    // Return 0 if no luma planes exist
-    for (int n = 0; ; n++) {
-        if (n > 4)
-            return 0;
+static void pass_add_hook(struct gl_video *p, struct tex_hook hook)
+{
+    if (p->tex_hook_num < MAX_TEXTURE_HOOKS) {
+        p->tex_hooks[p->tex_hook_num++] = hook;
+    } else {
+        MP_ERR(p, "Too many hooks! Limit is %d.\n", MAX_TEXTURE_HOOKS);
 
-        if (tex[n].type == PLANE_LUMA)
-            break;
+        if (hook.free)
+            hook.free(&hook);
     }
+}
 
-    // The downscaling threshold check is turned off.
-    if (p->opts.prescale_downscaling_threshold < 1.0f)
-        return p->opts.prescale_passes;
+// Registers the same hook once for every texture name in the list. The list
+// must be NULL-terminated; the HOOKS() macro below builds such a list.
+#define HOOKS(...) ((char*[]){__VA_ARGS__, NULL})
+static void pass_add_hooks(struct gl_video *p, struct tex_hook hook,
+                           char **names)
+{
+    for (char **cur = names; *cur; cur++) {
+        hook.hook_tex = *cur;
+        pass_add_hook(p, hook);
+    }
+}
 
-    double scale_factors[2];
-    get_scale_factors(p, true, scale_factors);
+static void deband_hook(struct gl_video *p, struct img_tex tex,
+                        struct gl_transform *trans, void *priv)
+{   // Hook callback that only forwards to pass_sample_deband(); tex, trans and priv are not used.
+    pass_sample_deband(p->sc, p->opts.deband_opts, &p->lfg);
+}
 
-    int passes = 0;
-    for (; passes < p->opts.prescale_passes; passes ++) {
-        // The scale factor happens to be the same for superxbr and nnedi3.
-        scale_factors[0] /= 2;
-        scale_factors[1] /= 2;
+static void unsharp_hook(struct gl_video *p, struct img_tex tex,
+                         struct gl_transform *trans, void *priv)
+{   // Hook callback; tex, trans and priv are not used.
+    GLSLF("#define tex HOOKED\n"); // alias tex/pos/pt to the HOOKED* names (presumably what the unsharp code expects)
+    GLSLF("#define pos HOOKED_pos\n");
+    GLSLF("#define pt HOOKED_pt\n");
+    pass_sample_unsharp(p->sc, p->opts.unsharp);
+}
 
-        if (1.0f / scale_factors[0] > p->opts.prescale_downscaling_threshold)
-            break;
-        if (1.0f / scale_factors[1] > p->opts.prescale_downscaling_threshold)
-            break;
-    }
+static void user_hook_old(struct gl_video *p, struct img_tex tex,
+                          struct gl_transform *trans, void *priv)
+{
+    const char *body = priv;
+    assert(body);
 
-    return passes;
+    GLSLHF("#define pixel_size HOOKED_pt\n");
+    load_shader(p, bstr0(body));
+    const char *fn_name = get_custom_shader_fn(p, body);
+    GLSLF("// custom shader\n");
+    GLSLF("color = %s(HOOKED_raw, HOOKED_pos, HOOKED_size);\n", fn_name);
 }
 
-// Upload the NNEDI3 UBO weights only if needed
-static void upload_nnedi3_weights(struct gl_video *p)
+// Returns whether successful. 'result' is left untouched on failure
+static bool eval_szexpr(struct gl_video *p, struct img_tex tex,
+                        struct szexp expr[MAX_SZEXP_SIZE],
+                        float *result)
 {
-    GL *gl = p->gl;
+    float stack[MAX_SZEXP_SIZE] = {0};
+    int idx = 0; // points to next element to push
 
-    if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO &&
-        !p->nnedi3_weights_buffer)
-    {
-        gl->GenBuffers(1, &p->nnedi3_weights_buffer);
-        gl->BindBufferBase(GL_UNIFORM_BUFFER, 0, p->nnedi3_weights_buffer);
+    for (int i = 0; i < MAX_SZEXP_SIZE; i++) {
+        switch (expr[i].tag) {
+        case SZEXP_END:
+            goto done;
 
-        int size;
-        const float *weights = get_nnedi3_weights(p->opts.nnedi3_opts, &size);
+        case SZEXP_CONST:
+            // Since our SZEXPs are bounded by MAX_SZEXP_SIZE, it should be
+            // impossible to overflow the stack
+            assert(idx < MAX_SZEXP_SIZE);
+            stack[idx++] = expr[i].val.cval;
+            continue;
 
-        MP_VERBOSE(p, "Uploading NNEDI3 weights via UBO (size=%d)\n", size);
+        case SZEXP_OP1:
+            if (idx < 1) {
+                MP_WARN(p, "Stack underflow in RPN expression!\n");
+                return false;
+            }
 
-        // We don't know the endianness of GPU, just assume it's LE
-        gl->BufferData(GL_UNIFORM_BUFFER, size, weights, GL_STATIC_DRAW);
-    }
-}
+            switch (expr[i].val.op) {
+            case SZEXP_OP_NOT: stack[idx-1] = !stack[idx-1]; break;
+            default: abort();
+            }
+            continue;
 
-// Applies a single pass of the prescaler, and accumulates the offset in
-// pass_transform.
-static void pass_prescale_luma(struct gl_video *p, struct img_tex *tex,
-                               struct gl_transform *pass_transform,
-                               struct fbotex fbo[MAX_PRESCALE_STEPS])
-{
-    // Happens to be the same for superxbr and nnedi3.
-    const int num_steps = 2;
+        case SZEXP_OP2:
+            if (idx < 2) {
+                MP_WARN(p, "Stack underflow in RPN expression!\n");
+                return false;
+            }
 
-    for (int step = 0; step < num_steps; step++) {
-        struct gl_transform step_transform = {{{0}}};
-        int id = pass_bind(p, *tex);
-        int planes = tex->components;
+            // Pop the operands in reverse order
+            float op2 = stack[--idx];
+            float op1 = stack[--idx];
+            float res = 0.0;
+            switch (expr[i].val.op) {
+            case SZEXP_OP_ADD: res = op1 + op2; break;
+            case SZEXP_OP_SUB: res = op1 - op2; break;
+            case SZEXP_OP_MUL: res = op1 * op2; break;
+            case SZEXP_OP_DIV: res = op1 / op2; break;
+            case SZEXP_OP_GT:  res = op1 > op2; break;
+            case SZEXP_OP_LT:  res = op1 < op2; break;
+            default: abort();
+            }
 
-        switch(p->opts.prescale_luma) {
-        case 1:
-            assert(planes == 1);
-            pass_superxbr(p->sc, id, step, tex->multiplier,
-                          p->opts.superxbr_opts, &step_transform);
-            break;
-        case 2:
-            upload_nnedi3_weights(p);
-            pass_nnedi3(p->gl, p->sc, planes, id, step, tex->multiplier,
-                        p->opts.nnedi3_opts, &step_transform, tex->gl_target);
-            break;
-        default:
-            abort();
-        }
+            if (!isfinite(res)) {
+                MP_WARN(p, "Illegal operation in RPN expression!\n");
+                return false;
+            }
 
-        int new_w = tex->w * (int)step_transform.m[0][0],
-            new_h = tex->h * (int)step_transform.m[1][1];
+            stack[idx++] = res;
+            continue;
 
-        skip_unused(p, planes);
-        finish_pass_fbo(p, &fbo[step], new_w, new_h, 0);
-        *tex = img_tex_fbo(&fbo[step], identity_trans, tex->type, tex->components);
+        case SZEXP_VAR_W:
+        case SZEXP_VAR_H: {
+            struct bstr name = expr[i].val.varname;
+            struct img_tex var_tex;
+
+            // The size of OUTPUT is determined. It could be useful for certain
+            // user shaders to skip passes.
+            if (bstr_equals0(name, "OUTPUT")) {
+                int vp_w = p->dst_rect.x1 - p->dst_rect.x0;
+                int vp_h = p->dst_rect.y1 - p->dst_rect.y0;
+                stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? vp_w : vp_h;
+                continue;
+            }
 
-        // Accumulate the local transform
-        gl_transform_trans(step_transform, pass_transform);
-    }
-}
+            // HOOKED is a special case
+            if (bstr_equals0(name, "HOOKED")) {
+                var_tex = tex;
+                goto found_tex;
+            }
 
-// Copy a texture to the vec4 color, while increasing offset. Also applies
-// the texture multiplier to the sampled color
-static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img)
-{
-    int count = img.components;
-    assert(*offset + count <= 4);
+            for (int o = 0; o < p->saved_tex_num; o++) {
+                if (bstr_equals0(name, p->saved_tex[o].name)) {
+                    var_tex = p->saved_tex[o].tex;
+                    goto found_tex;
+                }
+            }
 
-    int id = pass_bind(p, img);
-    char src[5] = {0};
-    char dst[5] = {0};
-    const char *tex_fmt = img.texture_la ? "ragg" : "rgba";
-    const char *dst_fmt = "rgba";
-    for (int i = 0; i < count; i++) {
-        src[i] = tex_fmt[i];
-        dst[i] = dst_fmt[*offset + i];
-    }
+            MP_WARN(p, "Texture %.*s not found in RPN expression!\n", BSTR_P(name));
+            return false;
 
-    if (img.use_integer) {
-        uint64_t tex_max = 1ull << p->image_desc.component_full_bits;
-        img.multiplier *= 1.0 / (tex_max - 1);
+found_tex:
+            stack[idx++] = (expr[i].tag == SZEXP_VAR_W) ? var_tex.w : var_tex.h;
+            continue;
+            }
+        }
     }
 
-    GLSLF("color.%s = %f * vec4(texture(texture%d, texcoord%d)).%s;\n",
-          dst, img.multiplier, id, id, src);
+done:
+    // Return the single stack element
+    if (idx != 1) {
+        MP_WARN(p, "Malformed stack after RPN expression!\n");
+        return false;
+    }
 
-    *offset += count;
+    *result = stack[0];
+    return true;
 }
 
-// sample from video textures, set "color" variable to yuv value
-static void pass_read_video(struct gl_video *p)
+static bool user_hook_cond(struct gl_video *p, struct img_tex tex, void *priv)
 {
-    struct img_tex tex[4];
-    pass_get_img_tex(p, &p->image, tex);
+    struct gl_user_shader *shader = priv;
+    assert(shader);
 
-    // Most of the steps here don't actually apply image transformations yet,
-    // save for the actual upscaling - so as a code convenience we store them
-    // separately
-    struct gl_transform transforms[4];
-    struct gl_transform tex_trans = identity_trans;
-    for (int i = 0; i < 4; i++) {
-        transforms[i] = tex[i].transform;
-        tex[i].transform = identity_trans;
-    }
+    float res = false;
+    eval_szexpr(p, tex, shader->cond, &res);
+    return res;
+}
 
-    int prescale_passes = get_prescale_passes(p, tex);
+static void user_hook(struct gl_video *p, struct img_tex tex,
+                      struct gl_transform *trans, void *priv)
+{
+    struct gl_user_shader *shader = priv;
+    assert(shader);
 
-    int dst_w = p->texture_w << prescale_passes,
-        dst_h = p->texture_h << prescale_passes;
+    load_shader(p, shader->pass_body);
+    GLSLF("// custom hook\n");
+    GLSLF("color = hook();\n");
 
-    bool needs_deband[4];
-    int scaler_id[4]; // ID if needed, -1 otherwise
-    int needs_prescale[4]; // number of prescaling passes left
+    // Make sure we at least create a legal FBO on failure, since it's better
+    // to do this and display an error message than just crash OpenGL
+    float w = 1.0, h = 1.0;
 
-    // Determine what needs to be done for which plane
-    for (int i=0; i < 4; i++) {
-        enum plane_type type = tex[i].type;
-        if (type == PLANE_NONE) {
-            needs_deband[i] = false;
-            needs_prescale[i] = 0;
-            scaler_id[i] = -1;
-            continue;
-        }
+    eval_szexpr(p, tex, shader->width, &w);
+    eval_szexpr(p, tex, shader->height, &h);
 
-        needs_deband[i] = type != PLANE_ALPHA ? p->opts.deband : false;
-        needs_prescale[i] = type == PLANE_LUMA ? prescale_passes : 0;
+    *trans = (struct gl_transform){{{w / tex.w, 0}, {0, h / tex.h}}};
+    gl_transform_trans(shader->offset, trans);
+}
 
-        scaler_id[i] = -1;
-        switch (type) {
-        case PLANE_RGB:
-        case PLANE_LUMA:
-        case PLANE_XYZ:
-            scaler_id[i] = SCALER_SCALE;
-            break;
+static void user_hook_free(struct tex_hook *hook)
+{   // tex_hook.free callback: releases the strings and priv data referenced by the hook
+    for (int slot = 0; slot < TEXUNIT_VIDEO_NUM; slot++)
+        talloc_free(hook->bind_tex[slot]);
+    talloc_free(hook->save_tex);
+    talloc_free(hook->hook_tex);
+    talloc_free(hook->priv);
+}
 
-        case PLANE_CHROMA:
-            scaler_id[i] = SCALER_CSCALE;
-            break;
+static void pass_hook_user_shaders_old(struct gl_video *p, char *name,
+                                       char **shaders)
+{
+    assert(name);
+    if (!shaders)
+        return;
 
-        case PLANE_ALPHA: // always use bilinear for alpha
-        default:
-            continue;
+    for (int n = 0; shaders[n] != NULL; n++) {
+        char *body = load_cached_file(p, shaders[n]).start;
+        if (body) {
+            pass_add_hook(p, (struct tex_hook) {
+                .hook_tex = name,
+                .bind_tex = {"HOOKED"},
+                .hook = user_hook_old,
+                .priv = body,
+            });
         }
-
-        // We can skip scaling if the texture is already at the required size
-        if (tex[i].w == dst_w && tex[i].h == dst_h)
-            scaler_id[i] = -1;
     }
+}
 
-    // Process all the planes that need some action performed
-    while (true) {
-        // Find next plane to operate on
-        int n = -1;
-        for (int i = 0; i < 4; i++) {
-            if (tex[i].type != PLANE_NONE &&
-                (scaler_id[i] >= 0 || needs_deband[i] || needs_prescale[i]))
-            {
-                n = i;
-                break;
+static void pass_hook_user_shaders(struct gl_video *p, char **shaders)
+{
+    if (!shaders)
+        return;
+    // Every pass parsed from every listed file is registered once per hooked texture.
+    for (int n = 0; shaders[n] != NULL; n++) {
+        struct bstr file = load_cached_file(p, shaders[n]);
+        struct gl_user_shader out;
+        while (parse_user_shader_pass(p->log, &file, &out)) {
+            struct tex_hook hook = {
+                .components = out.components,
+                .hook = user_hook,
+                .free = user_hook_free,
+                .cond = user_hook_cond,
+            };
+            // Unused hook_tex slots duplicate to NULL and are skipped below.
+            for (int i = 0; i < SHADER_MAX_HOOKS; i++) {
+                hook.hook_tex = bstrdup0(p, out.hook_tex[i]);
+                if (!hook.hook_tex)
+                    continue;
+                // Each registered hook gets its own copies; user_hook_free releases them.
+                struct gl_user_shader *out_copy = talloc_ptrtype(p, out_copy);
+                *out_copy = out;
+                hook.priv = out_copy;
+                for (int o = 0; o < SHADER_MAX_BINDS; o++)
+                    hook.bind_tex[o] = bstrdup0(p, out.bind_tex[o]);
+                hook.save_tex = bstrdup0(p, out.save_tex);
+                pass_add_hook(p, hook);
             }
         }
+    }
+}
 
-        if (n == -1) // no textures left
-            break;
+static void gl_video_setup_hooks(struct gl_video *p)
+{   // Calls gl_video_reset_hooks() and then registers hooks according to p->opts.
+    gl_video_reset_hooks(p);
+    // Debanding, when enabled, hooks LUMA, CHROMA, RGB and XYZ (ALPHA is not listed).
+    if (p->opts.deband) {
+        pass_add_hooks(p, (struct tex_hook) {.hook = deband_hook,
+                                             .bind_tex = {"HOOKED"}},
+                       HOOKS("LUMA", "CHROMA", "RGB", "XYZ"));
+    }
+    // The unsharp hook is attached to MAIN whenever opts.unsharp is non-zero.
+    if (p->opts.unsharp != 0.0) {
+        pass_add_hook(p, (struct tex_hook) {
+            .hook_tex = "MAIN",
+            .bind_tex = {"HOOKED"},
+            .hook = unsharp_hook,
+        });
+    }
+    // pre_shaders hook MAIN and post_shaders hook SCALED; user_shaders go through the pass parser.
+    pass_hook_user_shaders_old(p, "MAIN", p->opts.pre_shaders);
+    pass_hook_user_shaders_old(p, "SCALED", p->opts.post_shaders);
+    pass_hook_user_shaders(p, p->opts.user_shaders);
+}
+
+// sample from video textures, set "color" variable to yuv value
+static void pass_read_video(struct gl_video *p)
+{
+    struct img_tex tex[4];
+    struct gl_transform offsets[4];
+    pass_get_img_tex(p, &p->image, tex, offsets);
+
+    // To keep the code as simple as possible, we currently run all shader
+    // stages even if they would be unnecessary (e.g. no hooks for a texture).
+    // In the future, deferred img_tex should optimize this away.
+
+    // Merge semantically identical textures. This loop is done from back
+    // to front so that merged textures end up in the right order while
+    // simultaneously allowing us to skip unnecessary merges
+    for (int n = 3; n >= 0; n--) {
+        if (tex[n].type == PLANE_NONE)
+            continue;
 
-        // Figure out if it needs to be merged with anything else first
-        int o = -1;
-        for (int i = n+1; i < 4; i++) {
-            if (tex[i].type == tex[n].type
-                && tex[i].w == tex[n].w
-                && tex[i].h == tex[n].h
-                && gl_transform_eq(transforms[i], transforms[n]))
+        int first = n;
+        int num = 0;
+
+        for (int i = 0; i < n; i++) {
+            if (img_tex_equiv(tex[n], tex[i]) &&
+                gl_transform_eq(offsets[n], offsets[i]))
             {
-                o = i;
-                break;
+                GLSLF("// merging plane %d ...\n", i);
+                copy_img_tex(p, &num, tex[i]);
+                first = MPMIN(first, i);
+                memset(&tex[i], 0, sizeof(tex[i]));
             }
         }
 
-        // Multiple planes share the same dimensions and type, merge them for
-        // upscaling/debanding efficiency
-        if (o != -1) {
-            GLSLF("// merging plane %d into %d\n", o, n);
-
-            int num = 0;
+        if (num > 0) {
+            GLSLF("// merging plane %d ... into %d\n", n, first);
             copy_img_tex(p, &num, tex[n]);
-            copy_img_tex(p, &num, tex[o]);
             finish_pass_fbo(p, &p->merge_fbo[n], tex[n].w, tex[n].h, 0);
-            tex[n] = img_tex_fbo(&p->merge_fbo[n], identity_trans,
-                                 tex[n].type, num);
-
-            memset(&tex[o], 0, sizeof(tex[o]));
-            continue;
+            tex[first] = img_tex_fbo(&p->merge_fbo[n], tex[n].type, num);
+            memset(&tex[n], 0, sizeof(tex[n]));
         }
+    }
 
-        // The steps after this point (debanding, upscaling) can't handle
-        // integer textures, so the plane is still in that format by this point
-        // we need to ensure it gets converted
+    // If any textures are still in integer format by this point, we need
+    // to introduce an explicit conversion pass to avoid breaking hooks/scaling
+    for (int n = 0; n < 4; n++) {
         if (tex[n].use_integer) {
             GLSLF("// use_integer fix for plane %d\n", n);
 
             copy_img_tex(p, &(int){0}, tex[n]);
             finish_pass_fbo(p, &p->integer_fbo[n], tex[n].w, tex[n].h, 0);
-            tex[n] = img_tex_fbo(&p->integer_fbo[n], identity_trans,
-                                 tex[n].type, tex[n].components);
-            continue;
+            tex[n] = img_tex_fbo(&p->integer_fbo[n], tex[n].type,
+                                 tex[n].components);
         }
+    }
 
-        // Plane is not yet debanded
-        if (needs_deband[n]) {
-            GLSLF("// debanding plane %d\n", n);
+    // Dispatch the hooks for all of these textures, saving and perhaps
+    // modifying them in the process
+    for (int n = 0; n < 4; n++) {
+        const char *name;
+        switch (tex[n].type) {
+        case PLANE_RGB:    name = "RGB";    break;
+        case PLANE_LUMA:   name = "LUMA";   break;
+        case PLANE_CHROMA: name = "CHROMA"; break;
+        case PLANE_ALPHA:  name = "ALPHA";  break;
+        case PLANE_XYZ:    name = "XYZ";    break;
+        default: continue;
+        }
 
-            int id = pass_bind(p, tex[n]);
-            pass_sample_deband(p->sc, p->opts.deband_opts, id, tex[n].multiplier,
-                               tex[n].gl_target, &p->lfg);
-            skip_unused(p, tex[n].components);
-            finish_pass_fbo(p, &p->deband_fbo[n], tex[n].w, tex[n].h, 0);
-            tex[n] = img_tex_fbo(&p->deband_fbo[n], identity_trans,
-                                 tex[n].type, tex[n].components);
+        tex[n] = pass_hook(p, name, tex[n], &offsets[n]);
+    }
 
-            needs_deband[n] = false;
-            continue;
+    // At this point all planes are finalized but they may not be at the
+    // required size yet. Furthermore, they may have texture offsets that
+    // require realignment. For lack of something better to do, we assume
+    // the rgb/luma texture is the "reference" and scale everything else
+    // to match.
+    for (int n = 0; n < 4; n++) {
+        switch (tex[n].type) {
+        case PLANE_RGB:
+        case PLANE_XYZ:
+        case PLANE_LUMA: break;
+        default: continue;
         }
 
-        // Plane still needs prescaling passes
-        if (needs_prescale[n]) {
-            GLSLF("// prescaling plane %d (%d left)\n", n, needs_prescale[n]);
-            pass_prescale_luma(p, &tex[n], &tex_trans,
-                               p->prescale_fbo[needs_prescale[n]-1]);
-            needs_prescale[n]--;
-
-            // We can skip scaling if we arrived at our target res
-            if (tex[n].w == dst_w && tex[n].h == dst_h)
-                scaler_id[n] = -1;
-
-            // If we're done prescaling, we need to adjust all of the
-            // other transforms to make sure the planes still align
-            if (needs_prescale[n] == 0) {
-                for (int i = 0; i < 4; i++) {
-                    if (n == i)
-                        continue;
-
-                    transforms[i].t[0] -= tex_trans.t[0] / tex_trans.m[0][0];
-                    transforms[i].t[1] -= tex_trans.t[1] / tex_trans.m[1][1];
-                }
-            }
+        p->texture_w = tex[n].w;
+        p->texture_h = tex[n].h;
+        p->texture_offset = offsets[n];
+        break;
+    }
+
+    // Compute the reference rect
+    struct mp_rect_f src = {0.0, 0.0, p->image_params.w, p->image_params.h};
+    struct mp_rect_f ref = src;
+    gl_transform_rect(p->texture_offset, &ref);
+    MP_DBG(p, "ref rect: {%f %f} {%f %f}\n", ref.x0, ref.y0, ref.x1, ref.y1);
+
+    // Explicitly scale all of the textures that don't match
+    for (int n = 0; n < 4; n++) {
+        if (tex[n].type == PLANE_NONE)
             continue;
-        }
 
-        // Plane is not yet upscaled
-        if (scaler_id[n] >= 0) {
-            const struct scaler_config *conf = &p->opts.scaler[scaler_id[n]];
-            struct scaler *scaler = &p->scaler[scaler_id[n]];
-
-            // This is the only step that actually uses the transform
-            tex[n].transform = transforms[n];
-
-            // Bilinear scaling is a no-op due to GPU sampling
-            if (strcmp(conf->kernel.name, "bilinear") != 0) {
-                GLSLF("// upscaling plane %d\n", n);
-                pass_sample(p, tex[n], scaler, conf, 1.0, dst_w, dst_h);
-                finish_pass_fbo(p, &p->scale_fbo[n], dst_w, dst_h, FBOTEX_FUZZY);
-                tex[n] = img_tex_fbo(&p->scale_fbo[n], identity_trans,
-                                     tex[n].type, tex[n].components);
-                transforms[n] = identity_trans;
-            }
+        // If the planes are aligned identically, we will end up with the
+        // exact same source rectangle.
+        struct mp_rect_f rect = src;
+        gl_transform_rect(offsets[n], &rect);
+        MP_DBG(p, "rect[%d]: {%f %f} {%f %f}\n", n,
+               rect.x0, rect.y0, rect.x1, rect.y1);
+
+        if (mp_rect_f_seq(ref, rect))
+            continue;
+
+        // If the rectangles differ, then our planes have a different
+        // alignment and/or size. First of all, we have to compute the
+        // corrections required to meet the target rectangle
+        struct gl_transform fix = {
+            .m = {{(ref.x1 - ref.x0) / (rect.x1 - rect.x0), 0.0},
+                  {0.0, (ref.y1 - ref.y0) / (rect.y1 - rect.y0)}},
+            .t = {ref.x0, ref.y0},
+        };
+
+        // Since the scale in texture space is different from the scale in
+        // absolute terms, we have to scale the coefficients down to be
+        // relative to the texture's physical dimensions and local offset
+        struct gl_transform scale = {
+            .m = {{(float)tex[n].w / p->texture_w, 0.0},
+                  {0.0, (float)tex[n].h / p->texture_h}},
+            .t = {-rect.x0, -rect.y0},
+        };
+        gl_transform_trans(scale, &fix);
+        MP_DBG(p, "-> fix[%d] = {%f %f} + off {%f %f}\n", n,
+               fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]);
+
+        // Since the texture transform is a function of the texture coordinates
+        // to texture space, rather than the other way around, we have to
+        // actually apply the *inverse* of this. Fortunately, calculating
+        // the inverse is relatively easy here.
+        fix.m[0][0] = 1.0 / fix.m[0][0];
+        fix.m[1][1] = 1.0 / fix.m[1][1];
+        fix.t[0] = fix.m[0][0] * -fix.t[0];
+        fix.t[1] = fix.m[1][1] * -fix.t[1];
+        gl_transform_trans(fix, &tex[n].transform);
+
+        int scaler_id = -1;
+        const char *name = NULL;
+        switch (tex[n].type) {
+        case PLANE_RGB:
+        case PLANE_LUMA:
+        case PLANE_XYZ:
+            scaler_id = SCALER_SCALE;
+            // these aren't worth hooking, fringe hypothetical cases only
+            break;
+        case PLANE_CHROMA:
+            scaler_id = SCALER_CSCALE;
+            name = "CHROMA_SCALED";
+            break;
+        case PLANE_ALPHA:
+            // alpha always uses bilinear
+            name = "ALPHA_SCALED";
+        }
 
-            scaler_id[n] = -1;
+        if (scaler_id < 0)
             continue;
+
+        const struct scaler_config *conf = &p->opts.scaler[scaler_id];
+        struct scaler *scaler = &p->scaler[scaler_id];
+
+        // bilinear scaling is a free no-op thanks to GPU sampling
+        if (strcmp(conf->kernel.name, "bilinear") != 0) {
+            GLSLF("// upscaling plane %d\n", n);
+            pass_sample(p, tex[n], scaler, conf, 1.0, p->texture_w, p->texture_h);
+            finish_pass_fbo(p, &p->scale_fbo[n], p->texture_w, p->texture_h,
+                            FBOTEX_FUZZY);
+            tex[n] = img_tex_fbo(&p->scale_fbo[n], tex[n].type, tex[n].components);
         }
 
-        // Execution should never reach this point
-        abort();
+        // Run any post-scaling hooks
+        tex[n] = pass_hook(p, name, tex[n], NULL);
     }
 
     // All planes are of the same size and properly aligned at this point
@@ -1668,10 +1951,6 @@ static void pass_read_video(struct gl_video *p)
         if (tex[i].type != PLANE_NONE)
             copy_img_tex(p, &coord, tex[i]);
     }
-
-    p->texture_w = dst_w;
-    p->texture_h = dst_h;
-    p->texture_offset = tex_trans;
     p->components = coord;
 }
 
@@ -1679,7 +1958,7 @@ static void pass_read_video(struct gl_video *p)
 // transformations. Returns the ID of the texture unit it was bound to
 static int pass_read_fbo(struct gl_video *p, struct fbotex *fbo)
 {
-    struct img_tex tex = img_tex_fbo(fbo, identity_trans, PLANE_RGB, p->components);
+    struct img_tex tex = img_tex_fbo(fbo, PLANE_RGB, p->components);
     copy_img_tex(p, &(int){0}, tex);
 
     return pass_bind(p, tex);
@@ -1752,9 +2031,9 @@ static void pass_convert_yuv(struct gl_video *p)
     }
 
     p->components = 3;
-    if (!p->has_alpha || p->opts.alpha_mode == 0) { // none
+    if (!p->has_alpha || p->opts.alpha_mode == ALPHA_NO) {
         GLSL(color.a = 1.0;)
-    } else if (p->opts.alpha_mode == 2) { // blend against black
+    } else if (p->opts.alpha_mode == ALPHA_BLEND) {
         GLSL(color = vec4(color.rgb * color.a, 1.0);)
     } else { // alpha present in image
         p->components = 4;
@@ -1805,9 +2084,12 @@ static void pass_scale_main(struct gl_video *p)
     struct scaler_config scaler_conf = p->opts.scaler[SCALER_SCALE];
     if (p->opts.scaler_resizes_only && !downscaling && !upscaling) {
         scaler_conf.kernel.name = "bilinear";
-        // bilinear is going to be used, just remove all sub-pixel offsets.
-        p->texture_offset.t[0] = (int)p->texture_offset.t[0];
-        p->texture_offset.t[1] = (int)p->texture_offset.t[1];
+        // For scaler-resizes-only, we round the texture offset to
+        // the nearest integer value in order to prevent ugly blurriness
+        // (in exchange for slightly shifting the image by up to half a
+        // pixel)
+        p->texture_offset.t[0] = roundf(p->texture_offset.t[0]);
+        p->texture_offset.t[1] = roundf(p->texture_offset.t[1]);
     }
     if (downscaling && p->opts.scaler[SCALER_DSCALE].kernel.name) {
         scaler_conf = p->opts.scaler[SCALER_DSCALE];
@@ -1826,8 +2108,10 @@ static void pass_scale_main(struct gl_video *p)
 
     // Pre-conversion, like linear light/sigmoidization
     GLSLF("// scaler pre-conversion\n");
-    if (p->use_linear)
+    if (p->use_linear) {
         pass_linearize(p->sc, p->image_params.gamma);
+        pass_opt_hook_point(p, "LINEAR", NULL);
+    }
 
     bool use_sigmoid = p->use_linear && p->opts.sigmoid_upscaling && upscaling;
     float sig_center, sig_slope, sig_offset, sig_scale;
@@ -1842,8 +2126,11 @@ static void pass_scale_main(struct gl_video *p)
         sig_scale  = 1.0/(1+expf(sig_slope * (sig_center-1))) - sig_offset;
         GLSLF("color.rgb = %f - log(1.0/(color.rgb * %f + %f) - 1.0)/%f;\n",
                 sig_center, sig_scale, sig_offset, sig_slope);
+        pass_opt_hook_point(p, "SIGMOID", NULL);
     }
 
+    pass_opt_hook_point(p, "PREKERNEL", NULL);
+
     int vp_w = p->dst_rect.x1 - p->dst_rect.x0;
     int vp_h = p->dst_rect.y1 - p->dst_rect.y0;
     struct gl_transform transform;
@@ -1851,14 +2138,16 @@ static void pass_scale_main(struct gl_video *p)
 
     GLSLF("// main scaling\n");
     finish_pass_fbo(p, &p->indirect_fbo, p->texture_w, p->texture_h, 0);
-    struct img_tex src = img_tex_fbo(&p->indirect_fbo, transform,
-                                     PLANE_RGB, p->components);
+    struct img_tex src = img_tex_fbo(&p->indirect_fbo, PLANE_RGB, p->components);
+    gl_transform_trans(transform, &src.transform);
     pass_sample(p, src, scaler, &scaler_conf, scale_factor, vp_w, vp_h);
 
     // Changes the texture size to display size after main scaler.
     p->texture_w = vp_w;
     p->texture_h = vp_h;
 
+    pass_opt_hook_point(p, "POSTKERNEL", NULL);
+
     GLSLF("// scaler post-conversion\n");
     if (use_sigmoid) {
         // Inverse of the transformation above
@@ -1869,41 +2158,87 @@ static void pass_scale_main(struct gl_video *p)
 
 // Adapts the colors from the given color space to the display device's native
 // gamut.
-static void pass_colormanage(struct gl_video *p, enum mp_csp_prim prim_src,
+static void pass_colormanage(struct gl_video *p, float peak_src,
+                             enum mp_csp_prim prim_src,
                              enum mp_csp_trc trc_src)
 {
     GLSLF("// color management\n");
     enum mp_csp_trc trc_dst = p->opts.target_trc;
     enum mp_csp_prim prim_dst = p->opts.target_prim;
+    float peak_dst = p->opts.target_brightness;
 
     if (p->use_lut_3d) {
         // The 3DLUT is always generated against the original source space
         enum mp_csp_prim prim_orig = p->image_params.primaries;
         enum mp_csp_trc trc_orig = p->image_params.gamma;
 
+        // One exception: SMPTE ST.2084 is not implemented by LittleCMS
+        // due to technical limitations, so we use a gamma 2.2 input curve
+        // here instead. We could pick any curve we want here; the only
+        // difference is coding efficiency.
+        if (trc_orig == MP_CSP_TRC_SMPTE_ST2084)
+            trc_orig = MP_CSP_TRC_GAMMA22;
+
         if (gl_video_get_lut3d(p, prim_orig, trc_orig)) {
             prim_dst = prim_orig;
             trc_dst = trc_orig;
-        } else {
-            p->use_lut_3d = false;
         }
     }
 
-    if (prim_dst == MP_CSP_PRIM_AUTO)
+    // When auto-guessing the output color params, just pick the source color
+    // params to preserve the authentic "look and feel" of wrong/naive players.
+    // Some exceptions apply to source spaces that even hardcore technoluddites
+    // would probably not enjoy viewing unaltered
+    if (prim_dst == MP_CSP_PRIM_AUTO) {
         prim_dst = prim_src;
+
+        // Avoid outputting very wide gamut content automatically, since the
+        // majority target audience has standard gamut displays
+        if (prim_dst == MP_CSP_PRIM_BT_2020 || prim_dst == MP_CSP_PRIM_PRO_PHOTO)
+            prim_dst = MP_CSP_PRIM_BT_709;
+    }
+
     if (trc_dst == MP_CSP_TRC_AUTO) {
         trc_dst = trc_src;
-        // Avoid outputting linear light at all costs
+        // Avoid outputting linear light at all costs. First try
+        // falling back to the image gamma (e.g. in the case that the input
+        // was linear light due to linear-scaling)
         if (trc_dst == MP_CSP_TRC_LINEAR)
             trc_dst = p->image_params.gamma;
-        if (trc_dst == MP_CSP_TRC_LINEAR)
+
+        // Failing that, pick gamma 2.2 as a reasonable default. This is also
+        // picked as a default for outputting HDR content
+        if (trc_dst == MP_CSP_TRC_LINEAR || trc_dst == MP_CSP_TRC_SMPTE_ST2084)
             trc_dst = MP_CSP_TRC_GAMMA22;
     }
 
-    bool need_gamma = trc_src != trc_dst || prim_src != prim_dst;
+    if (!peak_src) {
+        // If the source peak is unknown, treat the content as display-referred
+        // (i.e. interpret it relative to the configured target peak, peak_dst)
+        peak_src = peak_dst;
+    }
+
+    // All operations from here on require linear light as a starting point,
+    // so we linearize even if trc_src == trc_dst when one of the other
+    // operations needs it
+    bool need_gamma = trc_src != trc_dst || prim_src != prim_dst ||
+                      peak_src != peak_dst;
     if (need_gamma)
         pass_linearize(p->sc, trc_src);
 
+    // Adapt and tone map for a different reference peak brightness
+    if (peak_src != peak_dst)
+    {
+        GLSLF("// HDR tone mapping\n");
+        float rel_peak = peak_src / peak_dst;
+        // Normalize such that 1 is the target brightness (and values above
+        // 1 are out of range)
+        GLSLF("color.rgb *= vec3(%f);\n", rel_peak);
+        // Tone map back down to the range [0,1]
+        pass_tone_map(p->sc, rel_peak, p->opts.hdr_tone_mapping,
+                      p->opts.tone_mapping_param);
+    }
+
     // Adapt to the right colorspace if necessary
     if (prim_src != prim_dst) {
         struct mp_csp_primaries csp_src = mp_get_csp_primaries(prim_src),
@@ -1914,8 +2249,14 @@ static void pass_colormanage(struct gl_video *p, enum mp_csp_prim prim_src,
         GLSL(color.rgb = cms_matrix * color.rgb;)
     }
 
-    if (need_gamma)
+    if (need_gamma) {
+        // If the target encoding function has a fixed peak, we need to
+        // un-normalize back to the encoding signal range
+        if (trc_dst == MP_CSP_TRC_SMPTE_ST2084)
+            GLSLF("color.rgb *= vec3(%f);\n", peak_dst / 10000);
+
         pass_delinearize(p->sc, trc_dst);
+    }
 
     if (p->use_lut_3d) {
         gl_sc_uniform_sampler(p->sc, "lut_3d", GL_TEXTURE_3D, TEXUNIT_3DLUT);
@@ -1928,11 +2269,11 @@ static void pass_dither(struct gl_video *p)
     GL *gl = p->gl;
 
     // Assume 8 bits per component if unknown.
-    int dst_depth = gl->fb_g ? gl->fb_g : 8;
+    int dst_depth = p->fb_depth;
     if (p->opts.dither_depth > 0)
         dst_depth = p->opts.dither_depth;
 
-    if (p->opts.dither_depth < 0 || p->opts.dither_algo < 0)
+    if (p->opts.dither_depth < 0 || p->opts.dither_algo == DITHER_NONE)
         return;
 
     if (!p->dither_texture) {
@@ -1940,12 +2281,12 @@ static void pass_dither(struct gl_video *p)
 
         int tex_size;
         void *tex_data;
-        GLint tex_iformat;
-        GLint tex_format;
+        GLint tex_iformat = 0;
+        GLint tex_format = 0;
         GLenum tex_type;
         unsigned char temp[256];
 
-        if (p->opts.dither_algo == 0) {
+        if (p->opts.dither_algo == DITHER_FRUIT) {
             int sizeb = p->opts.dither_size;
             int size = 1 << sizeb;
 
@@ -1956,15 +2297,14 @@ static void pass_dither(struct gl_video *p)
                 p->last_dither_matrix_size = size;
             }
 
-            const struct fmt_entry *fmt = find_tex_format(gl, 2, 1);
-            tex_size = size;
             // Prefer R16 texture since they provide higher precision.
-            if (fmt->internal_format) {
+            const struct gl_format *fmt = gl_find_unorm_format(gl, 2, 1);
+            if (!fmt || gl->es)
+                fmt = gl_find_float16_format(gl, 1);
+            tex_size = size;
+            if (fmt) {
                 tex_iformat = fmt->internal_format;
                 tex_format = fmt->format;
-            } else {
-                tex_iformat = gl_float16_formats[0].internal_format;
-                tex_format = gl_float16_formats[0].format;
             }
             tex_type = GL_FLOAT;
             tex_data = p->last_dither_matrix;
@@ -1972,7 +2312,7 @@ static void pass_dither(struct gl_video *p)
             assert(sizeof(temp) >= 8 * 8);
             mp_make_ordered_dither_matrix(temp, 8);
 
-            const struct fmt_entry *fmt = find_tex_format(gl, 1, 1);
+            const struct gl_format *fmt = gl_find_unorm_format(gl, 1, 1);
             tex_size = 8;
             tex_iformat = fmt->internal_format;
             tex_format = fmt->format;
@@ -1987,7 +2327,7 @@ static void pass_dither(struct gl_video *p)
         gl->BindTexture(GL_TEXTURE_2D, p->dither_texture);
         gl->PixelStorei(GL_UNPACK_ALIGNMENT, 1);
         gl->TexImage2D(GL_TEXTURE_2D, 0, tex_iformat, tex_size, tex_size, 0,
-                    tex_format, tex_type, tex_data);
+                       tex_format, tex_type, tex_data);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
         gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_REPEAT);
@@ -2057,9 +2397,12 @@ static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts,
         default:
             abort();
         }
-        // Subtitle color management, they're assumed to be sRGB by default
-        if (cms)
-            pass_colormanage(p, MP_CSP_PRIM_BT_709, MP_CSP_TRC_SRGB);
+        // Subtitle color management, they're assumed to be display-referred
+        // sRGB by default
+        if (cms) {
+            pass_colormanage(p, p->opts.target_brightness,
+                             MP_CSP_PRIM_BT_709, MP_CSP_TRC_SRGB);
+        }
         gl_sc_set_vao(p->sc, mpgl_osd_get_vao(p->osd));
         gl_sc_gen_shader_and_reset(p->sc);
         mpgl_osd_draw_part(p->osd, vp_w, vp_h, n);
@@ -2073,19 +2416,19 @@ static void pass_render_frame_dumb(struct gl_video *p, int fbo)
     p->gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
 
     struct img_tex tex[4];
-    pass_get_img_tex(p, &p->image, tex);
+    struct gl_transform off[4];
+    pass_get_img_tex(p, &p->image, tex, off);
 
     struct gl_transform transform;
     compute_src_transform(p, &transform);
 
-    struct gl_transform tchroma = transform;
-    tchroma.t[0] /= 1 << p->image_desc.chroma_xs;
-    tchroma.t[1] /= 1 << p->image_desc.chroma_ys;
-
     int index = 0;
     for (int i = 0; i < p->plane_count; i++) {
-        gl_transform_trans(tex[i].type == PLANE_CHROMA ? tchroma : transform,
-                           &tex[i].transform);
+        struct gl_transform trel = {{{(float)p->texture_w / tex[i].w, 0.0},
+                                     {0.0, (float)p->texture_h / tex[i].h}}};
+        gl_transform_trans(trel, &tex[i].transform);
+        gl_transform_trans(transform, &tex[i].transform);
+        gl_transform_trans(off[i], &tex[i].transform);
         copy_img_tex(p, &index, tex[i]);
     }
 
@@ -2101,6 +2444,8 @@ static void pass_render_frame(struct gl_video *p)
     p->texture_h = p->image_params.h;
     p->texture_offset = identity_trans;
     p->components = 0;
+    p->saved_tex_num = 0;
+    p->hook_fbo_num = 0;
 
     if (p->image_params.rotate % 180 == 90)
         MPSWAP(int, p->texture_w, p->texture_h);
@@ -2108,16 +2453,23 @@ static void pass_render_frame(struct gl_video *p)
     if (p->dumb_mode)
         return;
 
+    // Start the render timer here. It runs until the end of this
+    // function, to measure the time needed to draw (excluding screen
+    // presentation).
+    gl_timer_start(p->render_timer);
+
     p->use_linear = p->opts.linear_scaling || p->opts.sigmoid_upscaling;
     pass_read_video(p);
+    pass_opt_hook_point(p, "NATIVE", &p->texture_offset);
     pass_convert_yuv(p);
+    pass_opt_hook_point(p, "MAINPRESUB", &p->texture_offset);
 
     // For subtitles
     double vpts = p->image.mpi->pts;
     if (vpts == MP_NOPTS_VALUE)
         vpts = p->osd_pts;
 
-    if (p->osd && p->opts.blend_subs == 2) {
+    if (p->osd && p->opts.blend_subs == BLEND_SUBS_VIDEO) {
         double scale[2];
         get_scale_factors(p, false, scale);
         struct mp_osd_res rect = {
@@ -2130,20 +2482,13 @@ static void pass_render_frame(struct gl_video *p)
         GLSL(color = texture(texture0, texcoord0);)
         pass_read_fbo(p, &p->blend_subs_fbo);
     }
-
-    apply_shaders(p, p->opts.pre_shaders, p->texture_w, p->texture_h, p->pre_fbo);
-
-    if (p->opts.unsharp != 0.0) {
-        finish_pass_fbo(p, &p->unsharp_fbo, p->texture_w, p->texture_h, 0);
-        int id = pass_read_fbo(p, &p->unsharp_fbo);
-        pass_sample_unsharp(p->sc, id, p->opts.unsharp);
-    }
+    pass_opt_hook_point(p, "MAIN", &p->texture_offset);
 
     pass_scale_main(p);
 
     int vp_w = p->dst_rect.x1 - p->dst_rect.x0,
         vp_h = p->dst_rect.y1 - p->dst_rect.y0;
-    if (p->osd && p->opts.blend_subs == 1) {
+    if (p->osd && p->opts.blend_subs == BLEND_SUBS_YES) {
         // Recreate the real video size from the src/dst rects
         struct mp_osd_res rect = {
             .w = vp_w, .h = vp_h,
@@ -2157,22 +2502,26 @@ static void pass_render_frame(struct gl_video *p)
         rect.ml *= scale[0]; rect.mr *= scale[0];
         rect.mt *= scale[1]; rect.mb *= scale[1];
         // We should always blend subtitles in non-linear light
-        if (p->use_linear)
+        if (p->use_linear) {
             pass_delinearize(p->sc, p->image_params.gamma);
+            p->use_linear = false;
+        }
         finish_pass_fbo(p, &p->blend_subs_fbo, p->texture_w, p->texture_h,
                         FBOTEX_FUZZY);
         pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect,
                       p->texture_w, p->texture_h, p->blend_subs_fbo.fbo, false);
         pass_read_fbo(p, &p->blend_subs_fbo);
-        if (p->use_linear)
-            pass_linearize(p->sc, p->image_params.gamma);
     }
 
-    apply_shaders(p, p->opts.post_shaders, p->texture_w, p->texture_h, p->post_fbo);
+    pass_opt_hook_point(p, "SCALED", NULL);
+
+    gl_timer_stop(p->render_timer);
 }
 
 static void pass_draw_to_screen(struct gl_video *p, int fbo)
 {
+    gl_timer_start(p->present_timer);
+
     if (p->dumb_mode)
         pass_render_frame_dumb(p, fbo);
 
@@ -2183,19 +2532,23 @@ static void pass_draw_to_screen(struct gl_video *p, int fbo)
         GLSL(color.rgb = pow(color.rgb, vec3(user_gamma));)
     }
 
-    pass_colormanage(p, p->image_params.primaries,
+    pass_colormanage(p, p->image_params.peak, p->image_params.primaries,
                      p->use_linear ? MP_CSP_TRC_LINEAR : p->image_params.gamma);
 
     // Draw checkerboard pattern to indicate transparency
-    if (p->has_alpha && p->opts.alpha_mode == 3) {
+    if (p->has_alpha && p->opts.alpha_mode == ALPHA_BLEND_TILES) {
         GLSLF("// transparency checkerboard\n");
         GLSL(bvec2 tile = lessThan(fract(gl_FragCoord.xy / 32.0), vec2(0.5));)
         GLSL(vec3 background = vec3(tile.x == tile.y ? 1.0 : 0.75);)
         GLSL(color.rgb = mix(background, color.rgb, color.a);)
     }
 
+    pass_opt_hook_point(p, "OUTPUT", NULL);
+
     pass_dither(p);
     finish_pass_direct(p, fbo, p->vp_w, p->vp_h, &p->dst_rect);
+
+    gl_timer_stop(p->present_timer);
 }
 
 // Draws an interpolate frame to fbo, based on the frame timing in t
@@ -2214,7 +2567,8 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     // First of all, figure out if we have a frame availble at all, and draw
     // it manually + reset the queue if not
     if (p->surfaces[p->surface_now].pts == MP_NOPTS_VALUE) {
-        gl_video_upload_image(p, t->current);
+        if (!gl_video_upload_image(p, t->current))
+            return;
         pass_render_frame(p);
         finish_pass_fbo(p, &p->surfaces[p->surface_now].fbotex,
                         vp_w, vp_h, FBOTEX_FUZZY);
@@ -2273,7 +2627,8 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
             continue;
 
         if (f->pts > p->surfaces[p->surface_idx].pts) {
-            gl_video_upload_image(p, f);
+            if (!gl_video_upload_image(p, f))
+                return;
             pass_render_frame(p);
             finish_pass_fbo(p, &p->surfaces[surface_dst].fbotex,
                             vp_w, vp_h, FBOTEX_FUZZY);
@@ -2349,7 +2704,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
         for (int i = 0; i < size; i++) {
             struct img_tex img =
                 img_tex_fbo(&p->surfaces[fbosurface_wrap(surface_bse+i)].fbotex,
-                            identity_trans, PLANE_RGB, p->components);
+                            PLANE_RGB, p->components);
             // Since the code in pass_sample_separated currently assumes
             // the textures are bound in-order and starting at 0, we just
             // assert to make sure this is the case (which it should always be)
@@ -2366,12 +2721,24 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     p->frames_drawn += 1;
 }
 
+static void timer_dbg(struct gl_video *p, const char *name, struct gl_timer *t)
+{   // Debug-logs the last/avg/peak values of timer t, labelled with name.
+    if (gl_timer_sample_count(t) > 0) { // log only once t has samples
+        MP_DBG(p, "%s time: last %dus avg %dus peak %dus\n", name,
+               (int)gl_timer_last_us(t), // values are cast to int to match %d
+               (int)gl_timer_avg_us(t),
+               (int)gl_timer_peak_us(t));
+    }
+}
+
 // (fbo==0 makes BindFramebuffer select the screen backbuffer)
 void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
 {
     GL *gl = p->gl;
     struct video_image *vimg = &p->image;
 
+    p->broken_frame = false;
+
     gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
 
     bool has_frame = frame->current || vimg->mpi;
@@ -2402,7 +2769,8 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
             if (is_new || !p->output_fbo_valid) {
                 p->output_fbo_valid = false;
 
-                gl_video_upload_image(p, frame->current);
+                if (!gl_video_upload_image(p, frame->current))
+                    goto done;
                 pass_render_frame(p);
 
                 // For the non-interplation case, we draw to a single "cache"
@@ -2438,6 +2806,10 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
         }
     }
 
+done:
+
+    unmap_current_image(p);
+
     debug_check_gl(p, "after video rendering");
 
     gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
@@ -2447,8 +2819,15 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
                       p->osd_pts, p->osd_rect, p->vp_w, p->vp_h, fbo, true);
         debug_check_gl(p, "after OSD rendering");
     }
-
     gl->UseProgram(0);
+
+    if (gl_sc_error_state(p->sc) || p->broken_frame) {
+        // Make the screen solid blue to make it visually clear that an
+        // error has occurred
+        gl->ClearColor(0.0, 0.05, 0.5, 1.0);
+        gl->Clear(GL_COLOR_BUFFER_BIT);
+    }
+
     gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
 
     // The playloop calls this last before waiting some time until it decides
@@ -2457,6 +2836,11 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo)
     gl->Flush();
 
     p->frames_rendered++;
+
+    // Report performance metrics
+    timer_dbg(p, "upload", p->upload_timer);
+    timer_dbg(p, "render", p->render_timer);
+    timer_dbg(p, "present", p->present_timer);
 }
 
 // vp_w/vp_h is the implicit size of the target framebuffer.
@@ -2472,11 +2856,30 @@ void gl_video_resize(struct gl_video *p, int vp_w, int vp_h,
     p->vp_h = vp_h;
 
     gl_video_reset_surfaces(p);
+    gl_video_setup_hooks(p);
 
     if (p->osd)
         mpgl_osd_resize(p->osd, p->osd_rect, p->image_params.stereo_out);
 }
 
+static struct voctrl_performance_entry gl_video_perfentry(struct gl_timer *t)
+{   // Packs the last/avg/peak readings of timer t into one entry (by value).
+    return (struct voctrl_performance_entry) {
+        .last = gl_timer_last_us(t),
+        .avg  = gl_timer_avg_us(t),
+        .peak = gl_timer_peak_us(t),
+    };
+}
+
+struct voctrl_performance_data gl_video_perfdata(struct gl_video *p)
+{   // Returns the readings of p's upload, render and present timers by value.
+    return (struct voctrl_performance_data) {
+        .upload = gl_video_perfentry(p->upload_timer),
+        .render = gl_video_perfentry(p->render_timer),
+        .present = gl_video_perfentry(p->present_timer),
+    };
+}
+
 static bool unmap_image(struct gl_video *p, struct mp_image *mpi)
 {
     GL *gl = p->gl;
@@ -2504,15 +2907,17 @@ static bool map_image(struct gl_video *p, struct mp_image *mpi)
     for (int n = 0; n < p->plane_count; n++) {
         struct texplane *plane = &vimg->planes[n];
         mpi->stride[n] = mp_image_plane_w(mpi, n) * p->image_desc.bytes[n];
+        size_t buffer_size = mp_image_plane_h(mpi, n) * mpi->stride[n];
         if (!plane->gl_buffer) {
             gl->GenBuffers(1, &plane->gl_buffer);
             gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, plane->gl_buffer);
-            size_t buffer_size = mp_image_plane_h(mpi, n) * mpi->stride[n];
             gl->BufferData(GL_PIXEL_UNPACK_BUFFER, buffer_size,
                            NULL, GL_DYNAMIC_DRAW);
         }
         gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, plane->gl_buffer);
-        mpi->planes[n] = gl->MapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY);
+        mpi->planes[n] = gl->MapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0,
+                                            buffer_size, GL_MAP_WRITE_BIT |
+                                                GL_MAP_INVALIDATE_BUFFER_BIT);
         gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
         if (!mpi->planes[n]) {
             unmap_image(p, mpi);
@@ -2523,30 +2928,102 @@ static bool map_image(struct gl_video *p, struct mp_image *mpi)
     return true;
 }
 
-static void gl_video_upload_image(struct gl_video *p, struct mp_image *mpi)
+// This assumes nv12, with textures set to GL_NEAREST filtering.
+static void reinterleave_vdpau(struct gl_video *p, struct gl_hwdec_frame *frame)
+{   // Merges planes (0,1) and (2,3) of *frame into two double-height textures.
+    struct gl_hwdec_frame res = {0};
+    for (int n = 0; n < 2; n++) {
+        struct fbotex *fbo = &p->vdpau_deinterleave_fbo[n];
+        // This is an array of the 2 to-merge planes.
+        struct gl_hwdec_plane *src = &frame->planes[n * 2];
+        int w = src[0].tex_w; // the size of src[0] is used for both planes
+        int h = src[0].tex_h;
+        int ids[2]; // values returned by pass_bind, used in the names below
+        for (int t = 0; t < 2; t++) {
+            ids[t] = pass_bind(p, (struct img_tex){
+                .gl_tex = src[t].gl_texture,
+                .gl_target = src[t].gl_target,
+                .multiplier = 1.0,
+                .transform = identity_trans,
+                .tex_w = w,
+                .tex_h = h,
+                .w = w,
+                .h = h,
+            });
+        }
+        // Sample texture ids[0] or ids[1] depending on fract(y / 2) < 0.5.
+        GLSLF("color = fract(gl_FragCoord.y / 2) < 0.5\n");
+        GLSLF("      ? texture(texture%d, texcoord%d)\n", ids[0], ids[0]);
+        GLSLF("      : texture(texture%d, texcoord%d);", ids[1], ids[1]);
+        // Request w x 2h; GL_R8 for the first pair, GL_RG8 for the second.
+        fbotex_change(fbo, p->gl, p->log, w, h * 2, n == 0 ? GL_R8 : GL_RG8, 0);
+        // NOTE(review): presumably draws the pass into fbo over the w x 2h rect.
+        finish_pass_direct(p, fbo->fbo, fbo->rw, fbo->rh,
+                           &(struct mp_rect){0, 0, w, h * 2});
+        // Record the FBO texture as plane n of the result.
+        res.planes[n] = (struct gl_hwdec_plane){
+            .gl_texture = fbo->texture,
+            .gl_target = GL_TEXTURE_2D,
+            .tex_w = w,
+            .tex_h = h * 2,
+        };
+    }
+    *frame = res; // overwrite the caller's frame with the two merged planes
+}
+
+// Returns false on failure.
+static bool gl_video_upload_image(struct gl_video *p, struct mp_image *mpi)
 {
     GL *gl = p->gl;
     struct video_image *vimg = &p->image;
 
+    unref_current_image(p);
+
     mpi = mp_image_new_ref(mpi);
     if (!mpi)
-        abort();
+        goto error;
 
-    talloc_free(vimg->mpi);
     vimg->mpi = mpi;
     p->osd_pts = mpi->pts;
     p->frames_uploaded++;
 
     if (p->hwdec_active) {
-        GLuint imgtex[4] = {0};
-        bool ok = p->hwdec->driver->map_image(p->hwdec, vimg->mpi, imgtex) >= 0;
-        for (int n = 0; n < p->plane_count; n++)
-            vimg->planes[n].gl_texture = ok ? imgtex[n] : -1;
-        return;
+        // Hardware decoding
+        struct gl_hwdec_frame gl_frame = {0};
+        gl_timer_start(p->upload_timer);
+        bool ok = p->hwdec->driver->map_frame(p->hwdec, vimg->mpi, &gl_frame) >= 0;
+        gl_timer_stop(p->upload_timer);
+        vimg->hwdec_mapped = true;
+        if (ok) {
+            struct mp_image layout = {0};
+            mp_image_set_params(&layout, &p->image_params);
+            if (gl_frame.vdpau_fields)
+                reinterleave_vdpau(p, &gl_frame);
+            for (int n = 0; n < p->plane_count; n++) {
+                struct gl_hwdec_plane *plane = &gl_frame.planes[n];
+                vimg->planes[n] = (struct texplane){
+                    .w = mp_image_plane_w(&layout, n),
+                    .h = mp_image_plane_h(&layout, n),
+                    .tex_w = plane->tex_w,
+                    .tex_h = plane->tex_h,
+                    .gl_target = plane->gl_target,
+                    .gl_texture = plane->gl_texture,
+                };
+                snprintf(vimg->planes[n].swizzle, sizeof(vimg->planes[n].swizzle),
+                         "%s", plane->swizzle);
+            }
+        } else {
+            MP_FATAL(p, "Mapping hardware decoded surface failed.\n");
+            goto error;
+        }
+        return true;
     }
 
+    // Software decoding
     assert(mpi->num_planes == p->plane_count);
 
+    gl_timer_start(p->upload_timer);
+
     mp_image_t pbo_mpi = *mpi;
     bool pbo = map_image(p, &pbo_mpi);
     if (pbo) {
@@ -2567,28 +3044,36 @@ static void gl_video_upload_image(struct gl_video *p, struct mp_image *mpi)
             gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, plane->gl_buffer);
         gl->ActiveTexture(GL_TEXTURE0 + n);
         gl->BindTexture(plane->gl_target, plane->gl_texture);
-        glUploadTex(gl, plane->gl_target, plane->gl_format, plane->gl_type,
-                    mpi->planes[n], mpi->stride[n], 0, 0, plane->w, plane->h, 0);
+        gl_upload_tex(gl, plane->gl_target, plane->gl_format, plane->gl_type,
+                      mpi->planes[n], mpi->stride[n], 0, 0, plane->w, plane->h);
     }
     gl->ActiveTexture(GL_TEXTURE0);
     if (pbo)
         gl->BindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+    gl_timer_stop(p->upload_timer);
+
+    return true;
+
+error:
+    unref_current_image(p);
+    p->broken_frame = true;
+    return false;
 }
 
-static bool test_fbo(struct gl_video *p)
+static bool test_fbo(struct gl_video *p, GLint format) // true if format works
 {
     GL *gl = p->gl;
     bool success = false;
-    MP_VERBOSE(p, "Testing user-set FBO format (0x%x)\n",
-                   (unsigned)p->opts.fbo_format);
+    MP_VERBOSE(p, "Testing FBO format 0x%x\n", (unsigned)format);
     struct fbotex fbo = {0};
-    if (fbotex_init(&fbo, p->gl, p->log, 16, 16, p->opts.fbo_format)) {
+    if (fbotex_init(&fbo, p->gl, p->log, 16, 16, format)) { // 16x16 probe FBO
         gl->BindFramebuffer(GL_FRAMEBUFFER, fbo.fbo);
         gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
         success = true;
     }
     fbotex_uninit(&fbo);
-    glCheckError(gl, p->log, "FBO test");
+    gl_check_error(gl, p->log, "FBO test"); // its result does not alter success
     return success;
 }
 
@@ -2603,7 +3088,7 @@ static bool check_dumb_mode(struct gl_video *p)
         return true;
     if (o->target_prim || o->target_trc || o->linear_scaling ||
         o->correct_downscaling || o->sigmoid_upscaling || o->interpolation ||
-        o->blend_subs || o->deband || o->unsharp || o->prescale_luma)
+        o->blend_subs || o->deband || o->unsharp)
         return false;
     // check remaining scalers (tscale is already implicitly excluded above)
     for (int i = 0; i < SCALER_COUNT; i++) {
@@ -2617,6 +3102,8 @@ static bool check_dumb_mode(struct gl_video *p)
         return false;
     if (o->post_shaders && o->post_shaders[0])
         return false;
+    if (o->user_shaders && o->user_shaders[0])
+        return false;
     if (p->use_lut_3d)
         return false;
     return true;
@@ -2626,24 +3113,31 @@ static bool check_dumb_mode(struct gl_video *p)
 static void check_gl_features(struct gl_video *p)
 {
     GL *gl = p->gl;
-    bool have_float_tex = gl->mpgl_caps & MPGL_CAP_FLOAT_TEX;
-    bool have_fbo = gl->mpgl_caps & MPGL_CAP_FB;
+    bool have_float_tex = !!gl_find_float16_format(gl, 1);
     bool have_3d_tex = gl->mpgl_caps & MPGL_CAP_3D_TEX;
-    bool have_mix = gl->glsl_version >= 130;
+    bool have_mglsl = gl->glsl_version >= 130; // modern GLSL (1st class arrays etc.)
     bool have_texrg = gl->mpgl_caps & MPGL_CAP_TEX_RG;
-
-    if (have_fbo) {
-        if (!p->opts.fbo_format) {
-            p->opts.fbo_format = GL_RGBA16;
-            if (gl->es)
-                p->opts.fbo_format = have_float_tex ? GL_RGBA16F : GL_RGB10_A2;
+    bool have_tex16 = !gl->es || (gl->mpgl_caps & MPGL_CAP_EXT16);
+
+    const GLint auto_fbo_fmts[] = {GL_RGBA16, GL_RGBA16F, GL_RGB10_A2,
+                                   GL_RGBA8, 0};
+    GLint user_fbo_fmts[] = {p->opts.fbo_format, 0};
+    const GLint *fbo_fmts = user_fbo_fmts[0] ? user_fbo_fmts : auto_fbo_fmts;
+    bool have_fbo = false;
+    for (int n = 0; fbo_fmts[n]; n++) {
+        GLint fmt = fbo_fmts[n];
+        const struct gl_format *f = gl_find_internal_format(gl, fmt);
+        if (f && (f->flags & F_CF) == F_CF && test_fbo(p, fmt)) {
+            MP_VERBOSE(p, "Using FBO format 0x%x.\n", (unsigned)fmt);
+            have_fbo = true;
+            p->opts.fbo_format = fmt;
+            break;
         }
-        have_fbo = test_fbo(p);
     }
 
-    if (gl->es && p->opts.pbo) {
+    if (!gl->MapBufferRange && p->opts.pbo) {
         p->opts.pbo = 0;
-        MP_WARN(p, "Disabling PBOs (GLES unsupported).\n");
+        MP_WARN(p, "Disabling PBOs (GL2.1/GLES2 unsupported).\n");
     }
 
     p->forced_dumb_mode = p->opts.dumb_mode || !have_fbo || !have_texrg;
@@ -2666,12 +3160,14 @@ static void check_gl_features(struct gl_video *p)
             .alpha_mode = p->opts.alpha_mode,
             .use_rectangle = p->opts.use_rectangle,
             .background = p->opts.background,
-            .dither_algo = -1,
+            .dither_algo = DITHER_NONE,
+            .target_brightness = p->opts.target_brightness,
+            .hdr_tone_mapping = p->opts.hdr_tone_mapping,
+            .tone_mapping_param = p->opts.tone_mapping_param,
         };
         for (int n = 0; n < SCALER_COUNT; n++)
             new_opts.scaler[n] = gl_video_opts_def.scaler[n];
-        assign_options(&p->opts, &new_opts);
-        p->opts.deband_opts = m_config_alloc_struct(NULL, &deband_conf);
+        set_options(p, &new_opts);
         return;
     }
     p->dumb_mode = false;
@@ -2687,59 +3183,45 @@ static void check_gl_features(struct gl_video *p)
             char *reason = NULL;
             if (!have_float_tex)
                 reason = "(float tex. missing)";
+            if (!have_mglsl)
+                reason = "(GLSL version too old)";
             if (reason) {
+                MP_WARN(p, "Disabling scaler #%d %s %s.\n", n,
+                        p->opts.scaler[n].kernel.name, reason);
+                // p->opts is a copy of p->opts_alloc => we can just mess with it.
                 p->opts.scaler[n].kernel.name = "bilinear";
-                MP_WARN(p, "Disabling scaler #%d %s.\n", n, reason);
+                if (n == SCALER_TSCALE)
+                    p->opts.interpolation = 0;
             }
         }
     }
 
     // GLES3 doesn't provide filtered 16 bit integer textures
     // GLES2 doesn't even provide 3D textures
-    if (p->use_lut_3d && (!have_3d_tex || gl->es)) {
+    if (p->use_lut_3d && (!have_3d_tex || !have_tex16)) {
         p->use_lut_3d = false;
-        MP_WARN(p, "Disabling color management (GLES unsupported).\n");
+        MP_WARN(p, "Disabling color management (no RGB16 3D textures).\n");
     }
 
     int use_cms = p->opts.target_prim != MP_CSP_PRIM_AUTO ||
                   p->opts.target_trc != MP_CSP_TRC_AUTO || p->use_lut_3d;
 
     // mix() is needed for some gamma functions
-    if (!have_mix && (p->opts.linear_scaling || p->opts.sigmoid_upscaling)) {
+    if (!have_mglsl && (p->opts.linear_scaling || p->opts.sigmoid_upscaling)) {
         p->opts.linear_scaling = false;
         p->opts.sigmoid_upscaling = false;
         MP_WARN(p, "Disabling linear/sigmoid scaling (GLSL version too old).\n");
     }
-    if (!have_mix && use_cms) {
+    if (!have_mglsl && use_cms) {
         p->opts.target_prim = MP_CSP_PRIM_AUTO;
         p->opts.target_trc = MP_CSP_TRC_AUTO;
         p->use_lut_3d = false;
         MP_WARN(p, "Disabling color management (GLSL version too old).\n");
     }
-    if (!have_mix && p->opts.deband) {
+    if (!have_mglsl && p->opts.deband) {
         p->opts.deband = 0;
         MP_WARN(p, "Disabling debanding (GLSL version too old).\n");
     }
-
-    if (p->opts.prescale_luma == 2) {
-        if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_UBO) {
-            // Check features for uniform buffer objects.
-            if (!gl->BindBufferBase || !gl->GetUniformBlockIndex) {
-                MP_WARN(p, "Disabling NNEDI3 (%s required).\n",
-                        gl->es ? "OpenGL ES 3.0" : "OpenGL 3.1");
-                p->opts.prescale_luma = 0;
-            }
-        } else if (p->opts.nnedi3_opts->upload == NNEDI3_UPLOAD_SHADER) {
-            // Check features for hard coding approach.
-            if ((!gl->es && gl->glsl_version < 330) ||
-                (gl->es && gl->glsl_version < 300))
-            {
-                MP_WARN(p, "Disabling NNEDI3 (%s required).\n",
-                        gl->es ? "OpenGL ES 3.0" : "OpenGL 3.3");
-                p->opts.prescale_luma = 0;
-            }
-        }
-    }
 }
 
 static void init_gl(struct gl_video *p)
@@ -2748,9 +3230,6 @@ static void init_gl(struct gl_video *p)
 
     debug_check_gl(p, "before init_gl");
 
-    MP_VERBOSE(p, "Reported display depth: R=%d, G=%d, B=%d\n",
-               gl->fb_r, gl->fb_g, gl->fb_b);
-
     gl->Disable(GL_DITHER);
 
     gl_vao_init(&p->vao, gl, sizeof(struct vertex), vertex_vao);
@@ -2759,8 +3238,8 @@ static void init_gl(struct gl_video *p)
 
     // Test whether we can use 10 bit. Hope that testing a single format/channel
     // is good enough (instead of testing all 1-4 channels variants etc.).
-    const struct fmt_entry *fmt = find_tex_format(gl, 2, 1);
-    if (gl->GetTexLevelParameteriv && fmt->format) {
+    const struct gl_format *fmt = gl_find_unorm_format(gl, 2, 1);
+    if (gl->GetTexLevelParameteriv && fmt) {
         GLuint tex;
         gl->GenTextures(1, &tex);
         gl->BindTexture(GL_TEXTURE_2D, tex);
@@ -2781,6 +3260,34 @@ static void init_gl(struct gl_video *p)
         gl->DeleteTextures(1, &tex);
     }
 
+    if ((gl->es >= 300 || gl->version) && (gl->mpgl_caps & MPGL_CAP_FB)) {
+        gl->BindFramebuffer(GL_FRAMEBUFFER, gl->main_fb);
+
+        GLenum obj = gl->version ? GL_BACK_LEFT : GL_BACK;
+        if (gl->main_fb)
+            obj = GL_COLOR_ATTACHMENT0;
+
+        GLint depth_r = -1, depth_g = -1, depth_b = -1;
+
+        gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj,
+                            GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE, &depth_r);
+        gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj,
+                            GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &depth_g);
+        gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj,
+                            GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE, &depth_b);
+
+        MP_VERBOSE(p, "Reported display depth: R=%d, G=%d, B=%d\n",
+                   depth_r, depth_g, depth_b);
+
+        p->fb_depth = depth_g > 0 ? depth_g : 8;
+
+        gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
+    }
+
+    p->upload_timer = gl_timer_create(p->gl);
+    p->render_timer = gl_timer_create(p->gl);
+    p->present_timer = gl_timer_create(p->gl);
+
     debug_check_gl(p, "after init_gl");
 }
 
@@ -2799,16 +3306,25 @@ void gl_video_uninit(struct gl_video *p)
 
     gl->DeleteTextures(1, &p->lut_3d_texture);
 
+    gl_timer_free(p->upload_timer);
+    gl_timer_free(p->render_timer);
+    gl_timer_free(p->present_timer);
+
     mpgl_osd_destroy(p->osd);
 
     gl_set_debug_logger(gl, NULL);
 
-    assign_options(&p->opts, &(struct gl_video_opts){0});
     talloc_free(p);
 }
 
 void gl_video_set_gl_state(struct gl_video *p)
 {
+    // This resets certain important state to defaults.
+    gl_video_unset_gl_state(p);
+}
+
+void gl_video_unset_gl_state(struct gl_video *p)
+{
     GL *gl = p->gl;
 
     gl->ActiveTexture(GL_TEXTURE0);
@@ -2817,11 +3333,6 @@ void gl_video_set_gl_state(struct gl_video *p)
     gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4);
 }
 
-void gl_video_unset_gl_state(struct gl_video *p)
-{
-    /* nop */
-}
-
 void gl_video_reset(struct gl_video *p)
 {
     gl_video_reset_surfaces(p);
@@ -2833,40 +3344,42 @@ bool gl_video_showing_interpolated_frame(struct gl_video *p)
 }
 
 // dest = src.<w> (always using 4 components)
-static void packed_fmt_swizzle(char w[5], const struct fmt_entry *texfmt,
-                               const struct packed_fmt_entry *fmt)
+static void packed_fmt_swizzle(char w[5], const struct packed_fmt_entry *fmt)
 {
-    const char *comp = "rgba";
-
-    // Normally, we work with GL_RG
-    if (texfmt && texfmt->internal_format == GL_LUMINANCE_ALPHA)
-        comp = "ragb";
-
     for (int c = 0; c < 4; c++)
-        w[c] = comp[MPMAX(fmt->components[c] - 1, 0)];
+        w[c] = "rgba"[MPMAX(fmt->components[c] - 1, 0)];
     w[4] = '\0';
 }
 
-// Like find_tex_format(), but takes bits (not bytes), and but if no fixed point
-// format is available, return an unsigned integer format.
-static const struct fmt_entry *find_plane_format(GL *gl, int bytes_per_comp,
-                                                 int n_channels)
+// Like gl_find_unorm_format(), but takes bits (not bytes), and if no fixed
+// point format is available, return an unsigned integer format.
+static const struct gl_format *find_plane_format(GL *gl, int bits, int n_channels)
 {
-    const struct fmt_entry *e = find_tex_format(gl, bytes_per_comp, n_channels);
-    if (e->format || gl->es < 300)
-        return e;
-    return &gl_ui_byte_formats_gles3[n_channels - 1 + (bytes_per_comp - 1) * 4];
+    int bytes = (bits + 7) / 8;
+    const struct gl_format *f = gl_find_unorm_format(gl, bytes, n_channels);
+    if (f)
+        return f;
+    return gl_find_uint_format(gl, bytes, n_channels);
 }
 
-static bool init_format(int fmt, struct gl_video *init)
+static void init_image_desc(struct gl_video *p, int fmt)
 {
-    struct GL *gl = init->gl;
+    p->image_desc = mp_imgfmt_get_desc(fmt);
 
-    init->hwdec_active = false;
-    if (init->hwdec && init->hwdec->driver->imgfmt == fmt) {
-        fmt = init->hwdec->converted_imgfmt;
-        init->hwdec_active = true;
-    }
+    p->plane_count = p->image_desc.num_planes;
+    p->is_yuv = p->image_desc.flags & MP_IMGFLAG_YUV;
+    p->has_alpha = p->image_desc.flags & MP_IMGFLAG_ALPHA;
+    p->use_integer_conversion = false;
+    p->color_swizzle[0] = '\0';
+    p->is_packed_yuv = fmt == IMGFMT_UYVY || fmt == IMGFMT_YUYV;
+    p->hwdec_active = false;
+}
+
+// test_only=true checks if the format is supported
+// test_only=false also initializes some rendering parameters accordingly
+static bool init_format(struct gl_video *p, int fmt, bool test_only)
+{
+    struct GL *gl = p->gl;
 
     struct mp_imgfmt_desc desc = mp_imgfmt_get_desc(fmt);
     if (!desc.id)
@@ -2875,22 +3388,20 @@ static bool init_format(int fmt, struct gl_video *init)
     if (desc.num_planes > 4)
         return false;
 
-    const struct fmt_entry *plane_format[4] = {0};
-
-    init->color_swizzle[0] = '\0';
-    init->has_alpha = false;
+    const struct gl_format *plane_format[4] = {0};
+    char color_swizzle[5] = "";
+    const struct packed_fmt_entry *packed_format = {0};
 
     // YUV/planar formats
     if (desc.flags & (MP_IMGFLAG_YUV_P | MP_IMGFLAG_RGB_P)) {
         int bits = desc.component_bits;
         if ((desc.flags & MP_IMGFLAG_NE) && bits >= 8 && bits <= 16) {
-            init->has_alpha = desc.num_planes > 3;
-            plane_format[0] = find_plane_format(gl, (bits + 7) / 8, 1);
-            for (int p = 1; p < desc.num_planes; p++)
-                plane_format[p] = plane_format[0];
+            plane_format[0] = find_plane_format(gl, bits, 1);
+            for (int n = 1; n < desc.num_planes; n++)
+                plane_format[n] = plane_format[0];
             // RGB/planar
             if (desc.flags & MP_IMGFLAG_RGB_P)
-                snprintf(init->color_swizzle, sizeof(init->color_swizzle), "brga");
+                snprintf(color_swizzle, sizeof(color_swizzle), "brga");
             goto supported;
         }
     }
@@ -2899,50 +3410,37 @@ static bool init_format(int fmt, struct gl_video *init)
     if (desc.flags & MP_IMGFLAG_YUV_NV) {
         int bits = desc.component_bits;
         if ((desc.flags & MP_IMGFLAG_NE) && bits >= 8 && bits <= 16) {
-            plane_format[0] = find_plane_format(gl, (bits + 7) / 8, 1);
-            plane_format[1] = find_plane_format(gl, (bits + 7) / 8, 2);
+            plane_format[0] = find_plane_format(gl, bits, 1);
+            plane_format[1] = find_plane_format(gl, bits, 2);
             if (desc.flags & MP_IMGFLAG_YUV_NV_SWAP)
-                snprintf(init->color_swizzle, sizeof(init->color_swizzle), "rbga");
+                snprintf(color_swizzle, sizeof(color_swizzle), "rbga");
             goto supported;
         }
     }
 
     // XYZ (same organization as RGB packed, but requires conversion matrix)
     if (fmt == IMGFMT_XYZ12) {
-        plane_format[0] = find_tex_format(gl, 2, 3);
+        plane_format[0] = gl_find_unorm_format(gl, 2, 3);
         goto supported;
     }
 
-    // Packed RGB special formats
-    for (const struct fmt_entry *e = mp_to_gl_formats; e->mp_format; e++) {
-        if (!gl->es && e->mp_format == fmt) {
-            plane_format[0] = e;
-            goto supported;
-        }
-    }
-
     // Packed RGB(A) formats
     for (const struct packed_fmt_entry *e = mp_packed_formats; e->fmt; e++) {
         if (e->fmt == fmt) {
             int n_comp = desc.bytes[0] / e->component_size;
-            plane_format[0] = find_tex_format(gl, e->component_size, n_comp);
-            packed_fmt_swizzle(init->color_swizzle, plane_format[0], e);
-            init->has_alpha = e->components[3] != 0;
+            plane_format[0] = gl_find_unorm_format(gl, e->component_size, n_comp);
+            packed_format = e;
             goto supported;
         }
     }
 
-    // Packed YUV Apple formats
-    if (init->gl->mpgl_caps & MPGL_CAP_APPLE_RGB_422) {
-        for (const struct fmt_entry *e = gl_apple_formats; e->mp_format; e++) {
-            if (e->mp_format == fmt) {
-                init->is_packed_yuv = true;
-                snprintf(init->color_swizzle, sizeof(init->color_swizzle),
-                         "gbra");
-                plane_format[0] = e;
-                goto supported;
-            }
-        }
+    // Special formats for which OpenGL happens to have direct support.
+    plane_format[0] = gl_find_special_format(gl, fmt);
+    if (plane_format[0]) {
+        // Packed YUV Apple formats color permutation
+        if (plane_format[0]->format == GL_RGB_422_APPLE)
+            snprintf(color_swizzle, sizeof(color_swizzle), "gbra");
+        goto supported;
     }
 
     // Unsupported format
@@ -2951,46 +3449,56 @@ static bool init_format(int fmt, struct gl_video *init)
 supported:
 
     if (desc.component_bits > 8 && desc.component_bits < 16) {
-        if (init->texture_16bit_depth < 16)
+        if (p->texture_16bit_depth < 16)
             return false;
     }
 
     int use_integer = -1;
-    for (int p = 0; p < desc.num_planes; p++) {
-        if (!plane_format[p]->format)
+    for (int n = 0; n < desc.num_planes; n++) {
+        if (!plane_format[n])
             return false;
-        int use_int_plane = !!is_integer_format(plane_format[p]);
+        int use_int_plane = !!gl_integer_format_to_base(plane_format[n]->format);
         if (use_integer < 0)
             use_integer = use_int_plane;
         if (use_integer != use_int_plane)
             return false; // mixed planes not supported
     }
-    init->use_integer_conversion = use_integer;
 
-    if (init->use_integer_conversion && init->forced_dumb_mode)
+    if (use_integer && p->forced_dumb_mode)
         return false;
 
-    for (int p = 0; p < desc.num_planes; p++) {
-        struct texplane *plane = &init->image.planes[p];
-        const struct fmt_entry *format = plane_format[p];
-        assert(format);
-        plane->gl_format = format->format;
-        plane->gl_internal_format = format->internal_format;
-        plane->gl_type = format->type;
-        plane->use_integer = init->use_integer_conversion;
-    }
+    if (!test_only) {
+        for (int n = 0; n < desc.num_planes; n++) {
+            struct texplane *plane = &p->image.planes[n];
+            const struct gl_format *format = plane_format[n];
+            assert(format);
+            plane->gl_format = format->format;
+            plane->gl_internal_format = format->internal_format;
+            plane->gl_type = format->type;
+            plane->use_integer = use_integer;
+            snprintf(plane->swizzle, sizeof(plane->swizzle), "rgba");
+            if (packed_format)
+                packed_fmt_swizzle(plane->swizzle, packed_format);
+            if (plane->gl_format == GL_LUMINANCE_ALPHA)
+                MPSWAP(char, plane->swizzle[1], plane->swizzle[3]);
+        }
+
+        init_image_desc(p, fmt);
 
-    init->is_yuv = desc.flags & MP_IMGFLAG_YUV;
-    init->plane_count = desc.num_planes;
-    init->image_desc = desc;
+        p->use_integer_conversion = use_integer;
+        snprintf(p->color_swizzle, sizeof(p->color_swizzle), "%s", color_swizzle);
+    }
 
     return true;
 }
 
 bool gl_video_check_format(struct gl_video *p, int mp_format)
 {
-    struct gl_video tmp = *p;
-    return init_format(mp_format, &tmp);
+    if (init_format(p, mp_format, true))
+        return true;
+    if (p->hwdec && p->hwdec->driver->imgfmt == mp_format)
+        return true;
+    return false;
 }
 
 void gl_video_config(struct gl_video *p, struct mp_image_params *params)
@@ -3013,11 +3521,10 @@ void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd)
     mpgl_osd_destroy(p->osd);
     p->osd = NULL;
     p->osd_state = osd;
-    recreate_osd(p);
+    reinit_osd(p);
 }
 
-struct gl_video *gl_video_init(GL *gl, struct mp_log *log, struct mpv_global *g,
-                               struct gl_lcms *cms)
+struct gl_video *gl_video_init(GL *gl, struct mp_log *log, struct mpv_global *g)
 {
     if (gl->version < 210 && gl->es < 200) {
         mp_err(log, "At least OpenGL 2.1 or OpenGL ES 2.0 required.\n");
@@ -3029,17 +3536,15 @@ struct gl_video *gl_video_init(GL *gl, struct mp_log *log, struct mpv_global *g,
         .gl = gl,
         .global = g,
         .log = log,
-        .cms = cms,
-        .opts = gl_video_opts_def,
-        .gl_target = GL_TEXTURE_2D,
+        .cms = gl_lcms_init(p, log, g),
         .texture_16bit_depth = 16,
         .sc = gl_sc_create(gl, log),
     };
+    set_options(p, NULL);
     for (int n = 0; n < SCALER_COUNT; n++)
         p->scaler[n] = (struct scaler){.index = n};
     gl_video_set_debug(p, true);
     init_gl(p);
-    recreate_osd(p);
     return p;
 }
 
@@ -3062,62 +3567,32 @@ static const char *handle_scaler_opt(const char *name, bool tscale)
     return NULL;
 }
 
-static char **dup_str_array(void *parent, char **src)
-{
-    if (!src)
-        return NULL;
-
-    char **res = talloc_new(parent);
-    int num = 0;
-    for (int n = 0; src && src[n]; n++)
-        MP_TARRAY_APPEND(res, res, num, talloc_strdup(res, src[n]));
-    MP_TARRAY_APPEND(res, res, num, NULL);
-    return res;
-}
-
-static void assign_options(struct gl_video_opts *dst, struct gl_video_opts *src)
+static void set_options(struct gl_video *p, struct gl_video_opts *src)
 {
-    talloc_free(dst->scale_shader);
-    talloc_free(dst->pre_shaders);
-    talloc_free(dst->post_shaders);
-    talloc_free(dst->deband_opts);
-    talloc_free(dst->superxbr_opts);
-    talloc_free(dst->nnedi3_opts);
-
-    *dst = *src;
-
-    if (src->deband_opts)
-        dst->deband_opts = m_sub_options_copy(NULL, &deband_conf, src->deband_opts);
-
-    if (src->superxbr_opts) {
-        dst->superxbr_opts = m_sub_options_copy(NULL, &superxbr_conf,
-                                                src->superxbr_opts);
-    }
-
-    if (src->nnedi3_opts) {
-        dst->nnedi3_opts = m_sub_options_copy(NULL, &nnedi3_conf,
-                                                src->nnedi3_opts);
-    }
-
-    for (int n = 0; n < SCALER_COUNT; n++) {
-        dst->scaler[n].kernel.name =
-            (char *)handle_scaler_opt(dst->scaler[n].kernel.name,
-                                      n == SCALER_TSCALE);
-    }
-
-    dst->scale_shader = talloc_strdup(NULL, dst->scale_shader);
-    dst->pre_shaders = dup_str_array(NULL, dst->pre_shaders);
-    dst->post_shaders = dup_str_array(NULL, dst->post_shaders);
+    talloc_free(p->opts_alloc);
+    p->opts_alloc = m_sub_options_copy(p, &gl_video_conf, src);
+    p->opts = *p->opts_alloc;
 }
 
 // Set the options, and possibly update the filter chain too.
 // Note: assumes all options are valid and verified by the option parser.
 void gl_video_set_options(struct gl_video *p, struct gl_video_opts *opts)
 {
-    assign_options(&p->opts, opts);
+    set_options(p, opts);
+    reinit_from_options(p);
+}
+
+static void reinit_from_options(struct gl_video *p)
+{
+    p->use_lut_3d = false;
+
+    gl_lcms_set_options(p->cms, p->opts.icc_opts);
+    p->use_lut_3d = gl_lcms_has_profile(p->cms);
 
     check_gl_features(p);
     uninit_rendering(p);
+    gl_video_setup_hooks(p);
+    reinit_osd(p);
 
     if (p->opts.interpolation && !p->global->opts->video_sync && !p->dsi_warned) {
         MP_WARN(p, "Interpolation now requires enabling display-sync mode.\n"
@@ -3239,5 +3714,5 @@ void gl_video_set_ambient_lux(struct gl_video *p, int lux)
 void gl_video_set_hwdec(struct gl_video *p, struct gl_hwdec *hwdec)
 {
     p->hwdec = hwdec;
-    mp_image_unrefp(&p->image.mpi);
+    unref_current_image(p);
 }
diff --git a/video/out/opengl/video.h b/video/out/opengl/video.h
index 4702f8c..140a468 100644
--- a/video/out/opengl/video.h
+++ b/video/out/opengl/video.h
@@ -35,11 +35,6 @@
 #define TEXUNIT_3DLUT    (TEXUNIT_SCALERS+SCALER_COUNT)
 #define TEXUNIT_DITHER   (TEXUNIT_3DLUT+1)
 
-struct lut3d {
-    uint16_t *data;
-    int size[3];
-};
-
 struct scaler_fun {
     char *name;
     float params[2];
@@ -78,6 +73,33 @@ enum scaler_unit {
     SCALER_COUNT
 };
 
+enum dither_algo {
+    DITHER_NONE = 0,
+    DITHER_FRUIT,
+    DITHER_ORDERED,
+};
+
+enum alpha_mode {
+    ALPHA_NO = 0,
+    ALPHA_YES,
+    ALPHA_BLEND,
+    ALPHA_BLEND_TILES,
+};
+
+enum blend_subs_mode {
+    BLEND_SUBS_NO = 0,
+    BLEND_SUBS_YES,
+    BLEND_SUBS_VIDEO,
+};
+
+enum tone_mapping {
+    TONE_MAPPING_CLIP,
+    TONE_MAPPING_REINHARD,
+    TONE_MAPPING_HABLE,
+    TONE_MAPPING_GAMMA,
+    TONE_MAPPING_LINEAR,
+};
+
 struct gl_video_opts {
     int dumb_mode;
     struct scaler_config scaler[4];
@@ -86,6 +108,9 @@ struct gl_video_opts {
     int gamma_auto;
     int target_prim;
     int target_trc;
+    int target_brightness;
+    int hdr_tone_mapping;
+    float tone_mapping_param;
     int linear_scaling;
     int correct_downscaling;
     int sigmoid_upscaling;
@@ -108,14 +133,11 @@ struct gl_video_opts {
     char *scale_shader;
     char **pre_shaders;
     char **post_shaders;
+    char **user_shaders;
     int deband;
     struct deband_opts *deband_opts;
     float unsharp;
-    int prescale_luma;
-    int prescale_passes;
-    float prescale_downscaling_threshold;
-    struct superxbr_opts *superxbr_opts;
-    struct nnedi3_opts *nnedi3_opts;
+    struct mp_icc_opts *icc_opts;
 };
 
 extern const struct m_sub_options gl_video_conf;
@@ -125,19 +147,18 @@ extern const struct gl_video_opts gl_video_opts_def;
 struct gl_video;
 struct vo_frame;
 
-struct gl_video *gl_video_init(GL *gl, struct mp_log *log, struct mpv_global *g,
-                               struct gl_lcms *cms);
+struct gl_video *gl_video_init(GL *gl, struct mp_log *log, struct mpv_global *g);
 void gl_video_uninit(struct gl_video *p);
 void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd);
 void gl_video_set_options(struct gl_video *p, struct gl_video_opts *opts);
 bool gl_video_check_format(struct gl_video *p, int mp_format);
 void gl_video_config(struct gl_video *p, struct mp_image_params *params);
 void gl_video_set_output_depth(struct gl_video *p, int r, int g, int b);
-void gl_video_update_profile(struct gl_video *p);
 void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, int fbo);
 void gl_video_resize(struct gl_video *p, int vp_w, int vp_h,
                      struct mp_rect *src, struct mp_rect *dst,
                      struct mp_osd_res *osd);
+struct voctrl_performance_data gl_video_perfdata(struct gl_video *p);
 struct mp_csp_equalizer;
 struct mp_csp_equalizer *gl_video_eq_ptr(struct gl_video *p);
 void gl_video_eq_update(struct gl_video *p);
@@ -147,6 +168,8 @@ void gl_video_set_debug(struct gl_video *p, bool enable);
 float gl_video_scale_ambient_lux(float lmin, float lmax,
                                  float rmin, float rmax, float lux);
 void gl_video_set_ambient_lux(struct gl_video *p, int lux);
+void gl_video_set_icc_profile(struct gl_video *p, bstr icc_data);
+bool gl_video_icc_auto_enabled(struct gl_video *p);
 
 void gl_video_set_gl_state(struct gl_video *p);
 void gl_video_unset_gl_state(struct gl_video *p);
diff --git a/video/out/opengl/video_shaders.c b/video/out/opengl/video_shaders.c
index bea1bbf..1f37f4f 100644
--- a/video/out/opengl/video_shaders.c
+++ b/video/out/opengl/video_shaders.c
@@ -45,7 +45,7 @@ static void pass_sample_separated_get_weights(struct gl_shader_cache *sc,
 
     int N = scaler->kernel->size;
     if (N == 2) {
-        GLSL(vec2 c1 = texture(lut, vec2(0.5, fcoord_lut)).RG;)
+        GLSL(vec2 c1 = texture(lut, vec2(0.5, fcoord_lut)).rg;)
         GLSL(float weights[2] = float[](c1.r, c1.g);)
     } else if (N == 6) {
         GLSL(vec4 c1 = texture(lut, vec2(0.25, fcoord_lut));)
@@ -177,7 +177,7 @@ static void bicubic_calcweights(struct gl_shader_cache *sc, const char *t, const
     GLSLF("%s = %s * %s + vec4(0, 0, -0.5, 0.5);\n", t, t, s);
     GLSLF("%s = %s * %s + vec4(-0.6666, 0, 0.8333, 0.1666);\n", t, t, s);
     GLSLF("%s.xy *= vec2(1, 1) / vec2(%s.z, %s.w);\n", t, t, t);
-    GLSLF("%s.xy += vec2(1 + %s, 1 - %s);\n", t, s, s);
+    GLSLF("%s.xy += vec2(1.0 + %s, 1.0 - %s);\n", t, s, s);
 }
 
 void pass_sample_bicubic_fast(struct gl_shader_cache *sc)
@@ -187,8 +187,8 @@ void pass_sample_bicubic_fast(struct gl_shader_cache *sc)
     bicubic_calcweights(sc, "parmx", "fcoord.x");
     bicubic_calcweights(sc, "parmy", "fcoord.y");
     GLSL(vec4 cdelta;)
-    GLSL(cdelta.xz = parmx.RG * vec2(-pt.x, pt.x);)
-    GLSL(cdelta.yw = parmy.RG * vec2(-pt.y, pt.y);)
+    GLSL(cdelta.xz = parmx.rg * vec2(-pt.x, pt.x);)
+    GLSL(cdelta.yw = parmy.rg * vec2(-pt.y, pt.y);)
     // first y-interpolation
     GLSL(vec4 ar = texture(tex, pos + cdelta.xy);)
     GLSL(vec4 ag = texture(tex, pos + cdelta.xw);)
@@ -208,34 +208,25 @@ void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
     GLSLF("{\n");
     GLSL(vec2 pos = pos + vec2(0.5) * pt;) // round to nearest
     GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));)
-    // We only need to sample from the four corner pixels since we're using
-    // nearest neighbour and can compute the exact transition point
-    GLSL(vec2 baseNW = pos - fcoord * pt;)
-    GLSL(vec2 baseNE = baseNW + vec2(pt.x, 0.0);)
-    GLSL(vec2 baseSW = baseNW + vec2(0.0, pt.y);)
-    GLSL(vec2 baseSE = baseNW + pt;)
     // Determine the mixing coefficient vector
     gl_sc_uniform_vec2(sc, "output_size", (float[2]){w, h});
-    GLSL(vec2 coeff = vec2((baseSE - pos) * output_size);)
-    GLSL(coeff = clamp(coeff, 0.0, 1.0);)
+    GLSL(vec2 coeff = fcoord * output_size/size;)
     float threshold = scaler->conf.kernel.params[0];
-    if (threshold > 0) { // also rules out NAN
-        GLSLF("coeff = mix(coeff, vec2(0.0), "
-              "lessThanEqual(coeff, vec2(%f)));\n", threshold);
-        GLSLF("coeff = mix(coeff, vec2(1.0), "
-              "greaterThanEqual(coeff, vec2(%f)));\n", 1.0 - threshold);
-    }
+    threshold = isnan(threshold) ? 0.0 : threshold;
+    GLSLF("coeff = (coeff - %f) / %f;\n", threshold, 1.0 - 2 * threshold);
+    GLSL(coeff = clamp(coeff, 0.0, 1.0);)
     // Compute the right blend of colors
-    GLSL(vec4 left = mix(texture(tex, baseSW),
-                         texture(tex, baseNW),
-                         coeff.y);)
-    GLSL(vec4 right = mix(texture(tex, baseSE),
-                          texture(tex, baseNE),
-                          coeff.y);)
-    GLSL(color = mix(right, left, coeff.x);)
+    GLSL(color = texture(tex, pos + pt * (coeff - fcoord));)
     GLSLF("}\n");
 }
 
+// Common constants for SMPTE ST.2084 (HDR)
+static const float HDR_M1 = 2610./4096 * 1./4,
+                   HDR_M2 = 2523./4096 * 128,
+                   HDR_C1 = 3424./4096,
+                   HDR_C2 = 2413./4096 * 32,
+                   HDR_C3 = 2392./4096 * 32;
+
 // Linearize (expand), given a TRC as input
 void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
 {
@@ -267,6 +258,15 @@ void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
                              pow(color.rgb, vec3(1.8)),
                              lessThan(vec3(0.03125), color.rgb));)
         break;
+    case MP_CSP_TRC_SMPTE_ST2084:
+        GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", HDR_M2);
+        GLSLF("color.rgb = max(color.rgb - vec3(%f), vec3(0.0)) \n"
+              "             / (vec3(%f) - vec3(%f) * color.rgb);\n",
+              HDR_C1, HDR_C2, HDR_C3);
+        GLSLF("color.rgb = pow(color.rgb, vec3(1.0/%f));\n", HDR_M1);
+        break;
+    default:
+        abort();
     }
 }
 
@@ -301,12 +301,67 @@ void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
                              pow(color.rgb, vec3(1.0/1.8)),
                              lessThanEqual(vec3(0.001953), color.rgb));)
         break;
+    case MP_CSP_TRC_SMPTE_ST2084:
+        GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", HDR_M1);
+        GLSLF("color.rgb = (vec3(%f) + vec3(%f) * color.rgb) \n"
+              "             / (vec3(1.0) + vec3(%f) * color.rgb);\n",
+              HDR_C1, HDR_C2, HDR_C3);
+        GLSLF("color.rgb = pow(color.rgb, vec3(%f));\n", HDR_M2);
+        break;
+    default:
+        abort();
+    }
+}
+
+// Tone map from a known peak brightness to the range [0,1]
+void pass_tone_map(struct gl_shader_cache *sc, float peak,
+                   enum tone_mapping algo, float param)
+{
+    switch (algo) {
+    case TONE_MAPPING_CLIP:
+        GLSL(color.rgb = clamp(color.rgb, 0.0, 1.0);)
+        break;
+
+    case TONE_MAPPING_REINHARD: {
+        float contrast = isnan(param) ? 0.5 : param,
+              offset = (1.0 - contrast) / contrast;
+        GLSLF("color.rgb = color.rgb / (color.rgb + vec3(%f));\n", offset);
+        GLSLF("color.rgb *= vec3(%f);\n", (peak + offset) / peak);
+        break;
+    }
+
+    case TONE_MAPPING_HABLE: {
+        float A = 0.15, B = 0.50, C = 0.10, D = 0.20, E = 0.02, F = 0.30;
+        GLSLHF("vec3 hable(vec3 x) {\n");
+        GLSLHF("return ((x * (%f*x + %f)+%f)/(x * (%f*x + %f) + %f)) - %f;\n",
+               A, C*B, D*E, A, B, D*F, E/F);
+        GLSLHF("}\n");
+
+        GLSLF("color.rgb = hable(color.rgb) / hable(vec3(%f));\n", peak);
+        break;
+    }
+
+    case TONE_MAPPING_GAMMA: {
+        float gamma = isnan(param) ? 1.8 : param;
+        GLSLF("color.rgb = pow(color.rgb / vec3(%f), vec3(%f));\n",
+              peak, 1.0/gamma);
+        break;
+    }
+
+    case TONE_MAPPING_LINEAR: {
+        float coeff = isnan(param) ? 1.0 : param;
+        GLSLF("color.rgb = vec3(%f) * color.rgb;\n", coeff / peak);
+        break;
+    }
+
+    default:
+        abort();
     }
 }
 
 // Wide usage friendly PRNG, shamelessly stolen from a GLSL tricks forum post.
 // Obtain random numbers by calling rand(h), followed by h = permute(h) to
-// update the state.
+// update the state. Assumes the texture was hooked.
 static void prng_init(struct gl_shader_cache *sc, AVLFG *lfg)
 {
     GLSLH(float mod289(float x)  { return x - floor(x / 289.0) * 289.0; })
@@ -314,7 +369,7 @@ static void prng_init(struct gl_shader_cache *sc, AVLFG *lfg)
     GLSLH(float rand(float x)    { return fract(x / 41.0); })
 
     // Initialize the PRNG by hashing the position + a random uniform
-    GLSL(vec3 _m = vec3(pos, random) + vec3(1.0);)
+    GLSL(vec3 _m = vec3(HOOKED_pos, random) + vec3(1.0);)
     GLSL(float h = permute(permute(permute(_m.x)+_m.y)+_m.z);)
     gl_sc_uniform_f(sc, "random", (double)av_lfg_get(lfg) / UINT32_MAX);
 }
@@ -347,44 +402,40 @@ const struct m_sub_options deband_conf = {
     .defaults = &deband_opts_def,
 };
 
-// Stochastically sample a debanded result from a given texture
+// Stochastically sample a debanded result from a hooked texture.
 void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts,
-                        int tex_num, float tex_mul, GLenum tex_target, AVLFG *lfg)
+                        AVLFG *lfg)
 {
-    // Set up common variables and initialize the PRNG
+    // Initialize the PRNG
     GLSLF("{\n");
-    sampler_prelude(sc, tex_num);
     prng_init(sc, lfg);
 
     // Helper: Compute a stochastic approximation of the avg color around a
     // pixel
-    GLSLHF("vec4 average(%s tex, vec2 pos, vec2 pt, float range, inout float h) {",
-           mp_sampler_type(tex_target));
+    GLSLHF("vec4 average(float range, inout float h) {\n");
         // Compute a random rangle and distance
         GLSLH(float dist = rand(h) * range;     h = permute(h);)
         GLSLH(float dir  = rand(h) * 6.2831853; h = permute(h);)
-
-        GLSLHF("pt *= dist;\n");
-        GLSLH(vec2 o = vec2(cos(dir), sin(dir));)
+        GLSLH(vec2 o = dist * vec2(cos(dir), sin(dir));)
 
         // Sample at quarter-turn intervals around the source pixel
         GLSLH(vec4 ref[4];)
-        GLSLH(ref[0] = texture(tex, pos + pt * vec2( o.x,  o.y));)
-        GLSLH(ref[1] = texture(tex, pos + pt * vec2(-o.y,  o.x));)
-        GLSLH(ref[2] = texture(tex, pos + pt * vec2(-o.x, -o.y));)
-        GLSLH(ref[3] = texture(tex, pos + pt * vec2( o.y, -o.x));)
+        GLSLH(ref[0] = HOOKED_texOff(vec2( o.x,  o.y));)
+        GLSLH(ref[1] = HOOKED_texOff(vec2(-o.y,  o.x));)
+        GLSLH(ref[2] = HOOKED_texOff(vec2(-o.x, -o.y));)
+        GLSLH(ref[3] = HOOKED_texOff(vec2( o.y, -o.x));)
 
         // Return the (normalized) average
-        GLSLHF("return %f * (ref[0] + ref[1] + ref[2] + ref[3])/4.0;\n", tex_mul);
-    GLSLH(})
+        GLSLH(return (ref[0] + ref[1] + ref[2] + ref[3])/4.0;)
+    GLSLHF("}\n");
 
     // Sample the source pixel
-    GLSLF("color = %f * texture(tex, pos);\n", tex_mul);
+    GLSL(color = HOOKED_tex(HOOKED_pos);)
     GLSLF("vec4 avg, diff;\n");
     for (int i = 1; i <= opts->iterations; i++) {
         // Sample the average pixel and use it instead of the original if
         // the difference is below the given threshold
-        GLSLF("avg = average(tex, pos, pt, %f, h);\n", i * opts->range);
+        GLSLF("avg = average(%f, h);\n", i * opts->range);
         GLSL(diff = abs(color - avg);)
         GLSLF("color = mix(avg, color, greaterThan(diff, vec4(%f)));\n",
               opts->threshold / (i * 16384.0));
@@ -399,23 +450,21 @@ void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts,
     GLSLF("}\n");
 }
 
-void pass_sample_unsharp(struct gl_shader_cache *sc, int tex_num, float param)
-{
+// Assumes the texture was hooked
+void pass_sample_unsharp(struct gl_shader_cache *sc, float param) {
     GLSLF("// unsharp\n");
-    sampler_prelude(sc, tex_num);
-
     GLSLF("{\n");
-    GLSL(vec2 st1 = pt * 1.2;)
-    GLSL(vec4 p = texture(tex, pos);)
-    GLSL(vec4 sum1 = texture(tex, pos + st1 * vec2(+1, +1))
-                   + texture(tex, pos + st1 * vec2(+1, -1))
-                   + texture(tex, pos + st1 * vec2(-1, +1))
-                   + texture(tex, pos + st1 * vec2(-1, -1));)
-    GLSL(vec2 st2 = pt * 1.5;)
-    GLSL(vec4 sum2 = texture(tex, pos + st2 * vec2(+1,  0))
-                   + texture(tex, pos + st2 * vec2( 0, +1))
-                   + texture(tex, pos + st2 * vec2(-1,  0))
-                   + texture(tex, pos + st2 * vec2( 0, -1));)
+    GLSL(float st1 = 1.2;)
+    GLSL(vec4 p = HOOKED_tex(HOOKED_pos);)
+    GLSL(vec4 sum1 = HOOKED_texOff(st1 * vec2(+1, +1))
+                   + HOOKED_texOff(st1 * vec2(+1, -1))
+                   + HOOKED_texOff(st1 * vec2(-1, +1))
+                   + HOOKED_texOff(st1 * vec2(-1, -1));)
+    GLSL(float st2 = 1.5;)
+    GLSL(vec4 sum2 = HOOKED_texOff(st2 * vec2(+1,  0))
+                   + HOOKED_texOff(st2 * vec2( 0, +1))
+                   + HOOKED_texOff(st2 * vec2(-1,  0))
+                   + HOOKED_texOff(st2 * vec2( 0, -1));)
     GLSL(vec4 t = p * 0.859375 + sum2 * -0.1171875 + sum1 * -0.09765625;)
     GLSLF("color = p + t * %f;\n", param);
     GLSLF("}\n");
diff --git a/video/out/opengl/video_shaders.h b/video/out/opengl/video_shaders.h
index e010fdb..0ee3d81 100644
--- a/video/out/opengl/video_shaders.h
+++ b/video/out/opengl/video_shaders.h
@@ -38,9 +38,12 @@ void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
 void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
 void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
 
+void pass_tone_map(struct gl_shader_cache *sc, float peak,
+                   enum tone_mapping algo, float param);
+
 void pass_sample_deband(struct gl_shader_cache *sc, struct deband_opts *opts,
-                        int tex_num, float tex_mul, GLenum tex_target, AVLFG *lfg);
+                        AVLFG *lfg);
 
-void pass_sample_unsharp(struct gl_shader_cache *sc, int tex_num, float param);
+void pass_sample_unsharp(struct gl_shader_cache *sc, float param);
 
 #endif
diff --git a/video/out/vo.c b/video/out/vo.c
index 3e7999a..07476ad 100644
--- a/video/out/vo.c
+++ b/video/out/vo.c
@@ -43,6 +43,7 @@
 #include "options/m_config.h"
 #include "common/msg.h"
 #include "common/global.h"
+#include "video/hwdec.h"
 #include "video/mp_image.h"
 #include "sub/osd.h"
 #include "osdep/io.h"
@@ -258,12 +259,12 @@ static struct vo *vo_create(bool probing, struct mpv_global *global,
     mp_input_set_mouse_transform(vo->input_ctx, NULL, NULL);
     if (vo->driver->encode != !!vo->encode_lavc_ctx)
         goto error;
-    struct m_config *config = m_config_from_obj_desc(vo, vo->log, &desc);
-    if (m_config_apply_defaults(config, name, vo->opts->vo_defs) < 0)
+    vo->config = m_config_from_obj_desc(vo, vo->log, &desc);
+    if (m_config_apply_defaults(vo->config, name, vo->opts->vo_defs) < 0)
         goto error;
-    if (m_config_set_obj_params(config, args) < 0)
+    if (m_config_set_obj_params(vo->config, args) < 0)
         goto error;
-    vo->priv = config->optstruct;
+    vo->priv = vo->config->optstruct;
 
     if (pthread_create(&vo->in->thread, NULL, vo_thread, vo))
         goto error;
@@ -595,14 +596,14 @@ static void wait_event_fd(struct vo *vo, int64_t until_time)
 
     if (fds[1].revents & POLLIN) {
         char buf[100];
-        read(in->wakeup_pipe[0], buf, sizeof(buf)); // flush
+        (void)read(in->wakeup_pipe[0], buf, sizeof(buf)); // flush
     }
 }
 static void wakeup_event_fd(struct vo *vo)
 {
     struct vo_internal *in = vo->in;
 
-    write(in->wakeup_pipe[1], &(char){0}, 1);
+    (void)write(in->wakeup_pipe[1], &(char){0}, 1);
 }
 #else
 static void wait_event_fd(struct vo *vo, int64_t until_time){}
diff --git a/video/out/vo.h b/video/out/vo.h
index 49a7546..9c29d5f 100644
--- a/video/out/vo.h
+++ b/video/out/vo.h
@@ -61,9 +61,8 @@ enum mp_voctrl {
     VOCTRL_SET_EQUALIZER,               // struct voctrl_set_equalizer_args*
     VOCTRL_GET_EQUALIZER,               // struct voctrl_get_equalizer_args*
 
-    /* for hardware decoding */
-    VOCTRL_GET_HWDEC_INFO,              // struct mp_hwdec_info**
-    VOCTRL_LOAD_HWDEC_API,              // private to vo_opengl
+    /* private to vo_opengl */
+    VOCTRL_LOAD_HWDEC_API,
 
     // Redraw the image previously passed to draw_image() (basically, repeat
     // the previous draw_image call). If this is handled, the OSD should also
@@ -78,6 +77,8 @@ enum mp_voctrl {
     VOCTRL_UPDATE_WINDOW_TITLE,         // char*
     VOCTRL_UPDATE_PLAYBACK_STATE,       // struct voctrl_playback_state*
 
+    VOCTRL_PERFORMANCE_DATA,            // struct voctrl_performance_data*
+
     VOCTRL_SET_CURSOR_VISIBILITY,       // bool*
 
     VOCTRL_KILL_SCREENSAVER,
@@ -132,11 +133,22 @@ struct voctrl_get_equalizer_args {
 
 // VOCTRL_UPDATE_PLAYBACK_STATE
 struct voctrl_playback_state {
+    bool taskbar_progress;
     bool playing;
     bool paused;
     int percent_pos;
 };
 
+// VOCTRL_PERFORMANCE_DATA
+struct voctrl_performance_entry {
+    // Times are in microseconds
+    uint64_t last, avg, peak;
+};
+
+struct voctrl_performance_data {
+    struct voctrl_performance_entry upload, render, present;
+};
+
 enum {
     // VO does handle mp_image_params.rotate in 90 degree steps
     VO_CAP_ROTATE90     = 1 << 0,
@@ -296,12 +308,14 @@ struct vo {
     struct vo_w32_state *w32;
     struct vo_cocoa_state *cocoa;
     struct vo_wayland_state *wayland;
+    struct mp_hwdec_devices *hwdec_devs;
     struct input_ctx *input_ctx;
     struct osd_state *osd;
     struct encode_lavc_context *encode_lavc_ctx;
     struct vo_internal *in;
     struct mp_vo_opts *opts;
     struct vo_extra extra;
+    struct m_config *config;
 
     // --- The following fields are generally only changed during initialization.
 
diff --git a/video/out/vo_direct3d.c b/video/out/vo_direct3d.c
index e074572..5190095 100644
--- a/video/out/vo_direct3d.c
+++ b/video/out/vo_direct3d.c
@@ -34,7 +34,6 @@
 #include "video/csputils.h"
 #include "video/mp_image.h"
 #include "video/img_format.h"
-#include "video/d3d.h"
 #include "common/msg.h"
 #include "common/common.h"
 #include "w32_common.h"
@@ -192,10 +191,6 @@ typedef struct d3d_priv {
     struct mp_csp_equalizer video_eq;
 
     struct osdpart *osd[MAX_OSD_PARTS];
-
-    struct mp_hwdec_info hwdec_info;
-    struct mp_hwdec_ctx hwdec_ctx;
-    struct mp_d3d_ctx hwdec_d3d;
 } d3d_priv;
 
 struct fmt_entry {
@@ -743,9 +738,6 @@ static bool change_d3d_backbuffer(d3d_priv *priv)
             MP_VERBOSE(priv, "Creating Direct3D device failed.\n");
             return 0;
         }
-
-        // (race condition if this is called when recovering from a "lost" device)
-        priv->hwdec_d3d.d3d9_device = priv->d3d_device;
     } else {
         if (FAILED(IDirect3DDevice9_Reset(priv->d3d_device, &present_params))) {
             MP_ERR(priv, "Reseting Direct3D device failed.\n");
@@ -779,8 +771,6 @@ static bool change_d3d_backbuffer(d3d_priv *priv)
 
 static void destroy_d3d(d3d_priv *priv)
 {
-    priv->hwdec_d3d.d3d9_device = NULL;
-
     destroy_d3d_surfaces(priv);
 
     for (int n = 0; n < NUM_SHADERS; n++) {
@@ -1225,9 +1215,6 @@ static int preinit(struct vo *vo)
     priv->vo = vo;
     priv->log = vo->log;
 
-    priv->hwdec_info.hwctx = &priv->hwdec_ctx;
-    priv->hwdec_ctx.d3d_ctx = &priv->hwdec_d3d;
-
     for (int n = 0; n < MAX_OSD_PARTS; n++) {
         struct osdpart *osd = talloc_ptrtype(priv, osd);
         *osd = (struct osdpart) {
@@ -1275,11 +1262,6 @@ static int control(struct vo *vo, uint32_t request, void *data)
     d3d_priv *priv = vo->priv;
 
     switch (request) {
-    case VOCTRL_GET_HWDEC_INFO: {
-        struct mp_hwdec_info **arg = data;
-        *arg = &priv->hwdec_info;
-        return true;
-    }
     case VOCTRL_REDRAW_FRAME:
         d3d_draw_frame(priv);
         return VO_TRUE;
diff --git a/video/out/vo_lavc.c b/video/out/vo_lavc.c
index bd07d10..188a575 100644
--- a/video/out/vo_lavc.c
+++ b/video/out/vo_lavc.c
@@ -37,6 +37,7 @@ struct priv {
     uint8_t *buffer;
     size_t buffer_size;
     AVStream *stream;
+    AVCodecContext *codec;
     int have_first_packet;
 
     int harddup;
@@ -108,14 +109,14 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
          * warning here. We choose to ignore that; just because ffmpeg currently
          * uses a plain 'int' for these struct fields, it doesn't mean it always
          * will */
-        if (width == vc->stream->codec->width &&
-                height == vc->stream->codec->height) {
-            if (aspect.num != vc->stream->codec->sample_aspect_ratio.num ||
-                    aspect.den != vc->stream->codec->sample_aspect_ratio.den) {
+        if (width == vc->codec->width &&
+                height == vc->codec->height) {
+            if (aspect.num != vc->codec->sample_aspect_ratio.num ||
+                    aspect.den != vc->codec->sample_aspect_ratio.den) {
                 /* aspect-only changes are not critical */
                 MP_WARN(vo, "unsupported pixel aspect ratio change from %d:%d to %d:%d\n",
-                       vc->stream->codec->sample_aspect_ratio.num,
-                       vc->stream->codec->sample_aspect_ratio.den,
+                       vc->codec->sample_aspect_ratio.num,
+                       vc->codec->sample_aspect_ratio.den,
                        aspect.num, aspect.den);
             }
             goto done;
@@ -144,18 +145,20 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
         goto error;
     }
 
-    vc->stream = encode_lavc_alloc_stream(vo->encode_lavc_ctx,
-                                          AVMEDIA_TYPE_VIDEO);
-    vc->stream->sample_aspect_ratio = vc->stream->codec->sample_aspect_ratio =
+    if (encode_lavc_alloc_stream(vo->encode_lavc_ctx,
+                                 AVMEDIA_TYPE_VIDEO,
+                                 &vc->stream, &vc->codec) < 0)
+        goto error;
+    vc->stream->sample_aspect_ratio = vc->codec->sample_aspect_ratio =
             aspect;
-    vc->stream->codec->width = width;
-    vc->stream->codec->height = height;
-    vc->stream->codec->pix_fmt = pix_fmt;
+    vc->codec->width = width;
+    vc->codec->height = height;
+    vc->codec->pix_fmt = pix_fmt;
 
-    encode_lavc_set_csp(vo->encode_lavc_ctx, vc->stream, params->colorspace);
-    encode_lavc_set_csp_levels(vo->encode_lavc_ctx, vc->stream, params->colorlevels);
+    encode_lavc_set_csp(vo->encode_lavc_ctx, vc->codec, params->colorspace);
+    encode_lavc_set_csp_levels(vo->encode_lavc_ctx, vc->codec, params->colorlevels);
 
-    if (encode_lavc_open_codec(vo->encode_lavc_ctx, vc->stream) < 0)
+    if (encode_lavc_open_codec(vo->encode_lavc_ctx, vc->codec) < 0)
         goto error;
 
     vc->buffer_size = 6 * width * height + 200;
@@ -204,7 +207,7 @@ static void write_packet(struct vo *vo, int size, AVPacket *packet)
         packet->stream_index = vc->stream->index;
         if (packet->pts != AV_NOPTS_VALUE) {
             packet->pts = av_rescale_q(packet->pts,
-                                       vc->stream->codec->time_base,
+                                       vc->codec->time_base,
                                        vc->stream->time_base);
         } else {
             MP_VERBOSE(vo, "codec did not provide pts\n");
@@ -213,12 +216,12 @@ static void write_packet(struct vo *vo, int size, AVPacket *packet)
         }
         if (packet->dts != AV_NOPTS_VALUE) {
             packet->dts = av_rescale_q(packet->dts,
-                                       vc->stream->codec->time_base,
+                                       vc->codec->time_base,
                                        vc->stream->time_base);
         }
         if (packet->duration > 0) {
             packet->duration = av_rescale_q(packet->duration,
-                                       vc->stream->codec->time_base,
+                                       vc->codec->time_base,
                                        vc->stream->time_base);
         } else {
             // HACK: libavformat calculates dts wrong if the initial packet
@@ -226,15 +229,16 @@ static void write_packet(struct vo *vo, int size, AVPacket *packet)
             // have b-frames!
             if (!packet->duration)
                 if (!vc->have_first_packet)
-                    if (vc->stream->codec->has_b_frames
-                            || vc->stream->codec->max_b_frames)
+                    if (vc->codec->has_b_frames
+                            || vc->codec->max_b_frames)
                         if (vc->stream->time_base.num * 1000LL <=
                                 vc->stream->time_base.den)
                             packet->duration = FFMAX(1, av_rescale_q(1,
-                                 vc->stream->codec->time_base, vc->stream->time_base));
+                                 vc->codec->time_base, vc->stream->time_base));
         }
 
-        if (encode_lavc_write_frame(vo->encode_lavc_ctx, packet) < 0) {
+        if (encode_lavc_write_frame(vo->encode_lavc_ctx,
+                                    vc->stream, packet) < 0) {
             MP_ERR(vo, "error writing\n");
             return;
         }
@@ -246,30 +250,19 @@ static void write_packet(struct vo *vo, int size, AVPacket *packet)
 static int encode_video(struct vo *vo, AVFrame *frame, AVPacket *packet)
 {
     struct priv *vc = vo->priv;
-    if (encode_lavc_oformat_flags(vo->encode_lavc_ctx) & AVFMT_RAWPICTURE) {
-        if (!frame)
-            return 0;
-        memcpy(vc->buffer, frame, sizeof(AVPicture));
-        MP_DBG(vo, "got pts %f\n",
-               frame->pts * (double) vc->stream->codec->time_base.num /
-                            (double) vc->stream->codec->time_base.den);
-        packet->size = sizeof(AVPicture);
-        return packet->size;
-    } else {
-        int got_packet = 0;
-        int status = avcodec_encode_video2(vc->stream->codec, packet,
-                                           frame, &got_packet);
-        int size = (status < 0) ? status : got_packet ? packet->size : 0;
-
-        if (frame)
-            MP_DBG(vo, "got pts %f; out size: %d\n",
-                   frame->pts * (double) vc->stream->codec->time_base.num /
-                   (double) vc->stream->codec->time_base.den, size);
-
-        if (got_packet)
-            encode_lavc_write_stats(vo->encode_lavc_ctx, vc->stream);
-        return size;
-    }
+    int got_packet = 0;
+    int status = avcodec_encode_video2(vc->codec, packet,
+                                        frame, &got_packet);
+    int size = (status < 0) ? status : got_packet ? packet->size : 0;
+
+    if (frame)
+        MP_DBG(vo, "got pts %f; out size: %d\n",
+               frame->pts * (double) vc->codec->time_base.num /
+               (double) vc->codec->time_base.den, size);
+
+    if (got_packet)
+        encode_lavc_write_stats(vo->encode_lavc_ctx, vc->codec);
+    return size;
 }
 
 static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
@@ -295,7 +288,7 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
         pts = vc->expected_next_pts;
     }
 
-    avc = vc->stream->codec;
+    avc = vc->codec;
 
     if (vc->worst_time_base.den == 0) {
         //if (avc->time_base.num / avc->time_base.den >= vc->stream->time_base.num / vc->stream->time_base.den)
@@ -376,7 +369,7 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
     }
     vc->lastpts = outpts;
     ectx->last_video_in_pts = pts;
-    frameipts = floor((outpts + encode_lavc_getoffset(ectx, vc->stream))
+    frameipts = floor((outpts + encode_lavc_getoffset(ectx, vc->codec))
                       / timeunit + 0.5);
 
     // calculate expected pts of next video frame
@@ -396,7 +389,7 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
             MP_INFO(vo, "--oneverdrop increased pts by %d\n",
                     (int) (vc->lastipts - frameipts + step));
             frameipts = vc->lastipts + step;
-            vc->lastpts = frameipts * timeunit - encode_lavc_getoffset(ectx, vc->stream);
+            vc->lastpts = frameipts * timeunit - encode_lavc_getoffset(ectx, vc->codec);
         }
     }
 
@@ -417,16 +410,15 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
                 skipframes = 0;
 
             if (thisduration > skipframes) {
-                AVFrame *frame = av_frame_alloc();
+                AVFrame *frame = mp_image_to_av_frame(vc->lastimg);
+                if (!frame)
+                    abort();
 
                 // this is a nop, unless the worst time base is the STREAM time base
                 frame->pts = av_rescale_q(vc->lastipts + skipframes,
                                           vc->worst_time_base, avc->time_base);
 
-                enum AVPictureType savetype = frame->pict_type;
-                mp_image_copy_fields_to_av_frame(frame, vc->lastimg);
-                frame->pict_type = savetype;
-                    // keep this at avcodec_get_frame_defaults default
+                frame->pict_type = 0; // keep this at unknown/undefined
 
                 frame->quality = avc->global_quality;
 
diff --git a/video/out/vo_opengl.c b/video/out/vo_opengl.c
index dfef6ec..08b9b11 100644
--- a/video/out/vo_opengl.c
+++ b/video/out/vo_opengl.c
@@ -45,7 +45,6 @@
 #include "filter_kernels.h"
 #include "video/hwdec.h"
 #include "opengl/video.h"
-#include "opengl/lcms.h"
 
 #define NUM_VSYNC_FENCES 10
 
@@ -56,14 +55,15 @@ struct gl_priv {
     GL *gl;
 
     struct gl_video *renderer;
-    struct gl_lcms *cms;
 
     struct gl_hwdec *hwdec;
-    struct mp_hwdec_info hwdec_info;
+
+    int events;
+
+    void *original_opts;
 
     // Options
     struct gl_video_opts *renderer_opts;
-    struct mp_icc_opts *icc_opts;
     int use_glFinish;
     int waitvsync;
     int use_gl_debug;
@@ -130,7 +130,7 @@ static void draw_frame(struct vo *vo, struct vo_frame *frame)
             p->vsync_fences[p->num_vsync_fences++] = fence;
     }
 
-    gl_video_render_frame(p->renderer, frame, 0);
+    gl_video_render_frame(p->renderer, frame, gl->main_fb);
 
     if (p->use_glFinish)
         gl->Finish();
@@ -196,34 +196,31 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     return 0;
 }
 
-static void request_hwdec_api(struct gl_priv *p, const char *api_name)
+static void request_hwdec_api(struct vo *vo, void *api)
 {
+    struct gl_priv *p = vo->priv;
+
     if (p->hwdec)
         return;
 
-    p->hwdec = gl_hwdec_load_api(p->vo->log, p->gl, p->vo->global, api_name);
+    p->hwdec = gl_hwdec_load_api(p->vo->log, p->gl, p->vo->global,
+                                 vo->hwdec_devs, (intptr_t)api);
     gl_video_set_hwdec(p->renderer, p->hwdec);
-    if (p->hwdec)
-        p->hwdec_info.hwctx = p->hwdec->hwctx;
 }
 
-static void call_request_hwdec_api(struct mp_hwdec_info *info,
-                                   const char *api_name)
+static void call_request_hwdec_api(void *ctx, enum hwdec_type type)
 {
-    struct vo *vo = info->load_api_ctx;
-    assert(&((struct gl_priv *)vo->priv)->hwdec_info == info);
     // Roundabout way to run hwdec loading on the VO thread.
     // Redirects to request_hwdec_api().
-    vo_control(vo, VOCTRL_LOAD_HWDEC_API, (void *)api_name);
+    vo_control(ctx, VOCTRL_LOAD_HWDEC_API, (void *)(intptr_t)type);
 }
 
-static void get_and_update_icc_profile(struct gl_priv *p, int *events)
+static void get_and_update_icc_profile(struct gl_priv *p)
 {
-    bool has_profile = p->icc_opts->profile && p->icc_opts->profile[0];
-    if (p->icc_opts->profile_auto && !has_profile) {
+    if (gl_video_icc_auto_enabled(p->renderer)) {
         MP_VERBOSE(p, "Querying ICC profile...\n");
         bstr icc = bstr0(NULL);
-        int r = mpgl_control(p->glctx, events, VOCTRL_GET_ICC_PROFILE, &icc);
+        int r = mpgl_control(p->glctx, &p->events, VOCTRL_GET_ICC_PROFILE, &icc);
 
         if (r != VO_NOTAVAIL) {
             if (r == VO_FALSE) {
@@ -232,19 +229,15 @@ static void get_and_update_icc_profile(struct gl_priv *p, int *events)
                 MP_ERR(p, "icc-profile-auto not implemented on this platform.\n");
             }
 
-            gl_lcms_set_memory_profile(p->cms, &icc);
-            has_profile = true;
+            gl_video_set_icc_profile(p->renderer, icc);
         }
     }
-
-    if (has_profile)
-        gl_video_update_profile(p->renderer);
 }
 
-static void get_and_update_ambient_lighting(struct gl_priv *p, int *events)
+static void get_and_update_ambient_lighting(struct gl_priv *p)
 {
     int lux;
-    int r = mpgl_control(p->glctx, events, VOCTRL_GET_AMBIENT_LUX, &lux);
+    int r = mpgl_control(p->glctx, &p->events, VOCTRL_GET_AMBIENT_LUX, &lux);
     if (r == VO_TRUE) {
         gl_video_set_ambient_lux(p->renderer, lux);
     }
@@ -254,36 +247,31 @@ static void get_and_update_ambient_lighting(struct gl_priv *p, int *events)
     }
 }
 
-static bool reparse_cmdline(struct gl_priv *p, char *args)
+static const struct m_option options[];
+
+static const struct m_sub_options opengl_conf = {
+    .opts = options,
+    .size = sizeof(struct gl_priv),
+};
+
+static bool reparse_cmdline(struct vo *vo, char *args)
 {
-    struct m_config *cfg = NULL;
-    struct gl_priv *opts = NULL;
+    struct gl_priv *p = vo->priv;
     int r = 0;
 
-    // list of options which can be changed at runtime
-#define OPT_BASE_STRUCT struct gl_priv
-    static const struct m_option change_otps[] = {
-        OPT_SUBSTRUCT("", renderer_opts, gl_video_conf, 0),
-        {0}
-    };
-#undef OPT_BASE_STRUCT
+    struct gl_priv *opts = p;
 
     if (strcmp(args, "-") == 0) {
-        opts = p;
+        opts = p->original_opts;
     } else {
-        const struct gl_priv *vodef = p->vo->driver->priv_defaults;
-        cfg = m_config_new(NULL, p->vo->log, sizeof(*opts), vodef, change_otps);
-        opts = cfg->optstruct;
-        r = m_config_parse_suboptions(cfg, "opengl", args);
+        r = m_config_parse_suboptions(vo->config, "opengl", args);
     }
 
-    if (r >= 0) {
-        gl_video_set_options(p->renderer, opts->renderer_opts);
-        gl_video_configure_queue(p->renderer, p->vo);
-        p->vo->want_redraw = true;
-    }
+    gl_video_set_options(p->renderer, opts->renderer_opts);
+    get_and_update_icc_profile(p);
+    gl_video_configure_queue(p->renderer, p->vo);
+    p->vo->want_redraw = true;
 
-    talloc_free(cfg);
     return r >= 0;
 }
 
@@ -314,7 +302,7 @@ static int control(struct vo *vo, uint32_t request, void *data)
         return VO_NOTIMPL;
     }
     case VOCTRL_SCREENSHOT_WIN: {
-        struct mp_image *screen = glGetWindowScreenshot(p->gl);
+        struct mp_image *screen = gl_read_window_contents(p->gl);
         // set image parameters according to the display, if possible
         if (screen) {
             screen->params.primaries = p->renderer_opts->target_prim;
@@ -325,17 +313,12 @@ static int control(struct vo *vo, uint32_t request, void *data)
         *(struct mp_image **)data = screen;
         return true;
     }
-    case VOCTRL_GET_HWDEC_INFO: {
-        struct mp_hwdec_info **arg = data;
-        *arg = &p->hwdec_info;
-        return true;
-    }
     case VOCTRL_LOAD_HWDEC_API:
-        request_hwdec_api(p, data);
+        request_hwdec_api(vo, data);
         return true;
     case VOCTRL_SET_COMMAND_LINE: {
         char *arg = data;
-        return reparse_cmdline(p, arg);
+        return reparse_cmdline(vo, arg);
     }
     case VOCTRL_RESET:
         gl_video_reset(p->renderer);
@@ -346,18 +329,23 @@ static int control(struct vo *vo, uint32_t request, void *data)
             vo_wakeup(vo);
         }
         return true;
+    case VOCTRL_PERFORMANCE_DATA:
+        *(struct voctrl_performance_data *)data = gl_video_perfdata(p->renderer);
+        return true;
     }
 
     int events = 0;
     int r = mpgl_control(p->glctx, &events, request, data);
     if (events & VO_EVENT_ICC_PROFILE_CHANGED) {
-        get_and_update_icc_profile(p, &events);
+        get_and_update_icc_profile(p);
         vo->want_redraw = true;
     }
     if (events & VO_EVENT_AMBIENT_LIGHTING_CHANGED) {
-        get_and_update_ambient_lighting(p, &events);
+        get_and_update_ambient_lighting(p);
         vo->want_redraw = true;
     }
+    events |= p->events;
+    p->events = 0;
     if (events & VO_EVENT_RESIZE)
         resize(p);
     if (events & VO_EVENT_EXPOSE)
@@ -373,6 +361,10 @@ static void uninit(struct vo *vo)
 
     gl_video_uninit(p->renderer);
     gl_hwdec_uninit(p->hwdec);
+    if (vo->hwdec_devs) {
+        hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL);
+        hwdec_devices_destroy(vo->hwdec_devs);
+    }
     mpgl_uninit(p->glctx);
 }
 
@@ -411,32 +403,30 @@ static int preinit(struct vo *vo)
         MP_VERBOSE(vo, "swap_control extension missing.\n");
     }
 
-    p->cms = gl_lcms_init(p, vo->log, vo->global);
-    if (!p->cms)
-        goto err_out;
-    p->renderer = gl_video_init(p->gl, vo->log, vo->global, p->cms);
+    p->renderer = gl_video_init(p->gl, vo->log, vo->global);
     if (!p->renderer)
         goto err_out;
     gl_video_set_osd_source(p->renderer, vo->osd);
     gl_video_set_options(p->renderer, p->renderer_opts);
     gl_video_configure_queue(p->renderer, vo);
 
-    gl_lcms_set_options(p->cms, p->icc_opts);
-    get_and_update_icc_profile(p, &(int){0});
+    get_and_update_icc_profile(p);
 
-    p->hwdec_info.load_api = call_request_hwdec_api;
-    p->hwdec_info.load_api_ctx = vo;
+    vo->hwdec_devs = hwdec_devices_create();
+
+    hwdec_devices_set_loader(vo->hwdec_devs, call_request_hwdec_api, vo);
 
     int hwdec = vo->opts->hwdec_preload_api;
     if (hwdec == HWDEC_NONE)
         hwdec = vo->global->opts->hwdec_api;
     if (hwdec != HWDEC_NONE) {
-        p->hwdec = gl_hwdec_load_api_id(p->vo->log, p->gl, vo->global, hwdec);
+        p->hwdec = gl_hwdec_load_api(p->vo->log, p->gl, vo->global,
+                                     vo->hwdec_devs, hwdec);
         gl_video_set_hwdec(p->renderer, p->hwdec);
-        if (p->hwdec)
-            p->hwdec_info.hwctx = p->hwdec->hwctx;
     }
 
+    p->original_opts = m_sub_options_copy(p, &opengl_conf, p);
+
     return 0;
 
 err_out:
@@ -459,7 +449,6 @@ static const struct m_option options[] = {
     OPT_INTRANGE("vsync-fences", opt_vsync_fences, 0, 0, NUM_VSYNC_FENCES),
 
     OPT_SUBSTRUCT("", renderer_opts, gl_video_conf, 0),
-    OPT_SUBSTRUCT("", icc_opts, mp_icc_conf, 0),
     {0},
 };
 
@@ -494,7 +483,6 @@ const struct vo_driver video_out_opengl_hq = {
     .priv_size = sizeof(struct gl_priv),
     .priv_defaults = &(const struct gl_priv){
         .renderer_opts = (struct gl_video_opts *)&gl_video_opts_hq_def,
-        .es = -1,
     },
     .options = options,
 };
diff --git a/video/out/vo_opengl_cb.c b/video/out/vo_opengl_cb.c
index 40930fb..4ac0c96 100644
--- a/video/out/vo_opengl_cb.c
+++ b/video/out/vo_opengl_cb.c
@@ -89,13 +89,16 @@ struct mpv_opengl_cb_context {
     struct vo *active;
     int hwdec_api;
 
+    // --- This is only mutable while initialized=false, during which nothing
+    //     except the OpenGL context manager is allowed to access it.
+    struct mp_hwdec_devices *hwdec_devs;
+
     // --- All of these can only be accessed from the thread where the host
     //     application's OpenGL context is current - i.e. only while the
     //     host application is calling certain mpv_opengl_cb_* APIs.
     GL *gl;
     struct gl_video *renderer;
     struct gl_hwdec *hwdec;
-    struct mp_hwdec_info hwdec_info; // it's also semi-immutable after init
 };
 
 static void update(struct vo_priv *p);
@@ -176,15 +179,14 @@ int mpv_opengl_cb_init_gl(struct mpv_opengl_cb_context *ctx, const char *exts,
 
     mpgl_load_functions2(ctx->gl, get_proc_address, get_proc_address_ctx,
                          exts, ctx->log);
-    ctx->renderer = gl_video_init(ctx->gl, ctx->log, ctx->global, NULL);
+    ctx->renderer = gl_video_init(ctx->gl, ctx->log, ctx->global);
     if (!ctx->renderer)
         return MPV_ERROR_UNSUPPORTED;
 
-    ctx->hwdec = gl_hwdec_load_api_id(ctx->log, ctx->gl, ctx->global,
-                                      ctx->hwdec_api);
+    ctx->hwdec_devs = hwdec_devices_create();
+    ctx->hwdec = gl_hwdec_load_api(ctx->log, ctx->gl, ctx->global,
+                                   ctx->hwdec_devs, ctx->hwdec_api);
     gl_video_set_hwdec(ctx->renderer, ctx->hwdec);
-    if (ctx->hwdec)
-        ctx->hwdec_info.hwctx = ctx->hwdec->hwctx;
 
     pthread_mutex_lock(&ctx->lock);
     // We don't know the exact caps yet - use a known superset
@@ -222,6 +224,8 @@ int mpv_opengl_cb_uninit_gl(struct mpv_opengl_cb_context *ctx)
     ctx->renderer = NULL;
     gl_hwdec_uninit(ctx->hwdec);
     ctx->hwdec = NULL;
+    hwdec_devices_destroy(ctx->hwdec_devs);
+    ctx->hwdec_devs = NULL;
     talloc_free(ctx->gl);
     ctx->gl = NULL;
     talloc_free(ctx->new_opts_cfg);
@@ -274,6 +278,8 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h)
             ctx->gl->debug_context = opts->use_gl_debug;
             gl_video_set_debug(ctx->renderer, opts->use_gl_debug);
         }
+        if (gl_video_icc_auto_enabled(ctx->renderer))
+            MP_ERR(ctx, "icc-profile-auto is not available with opengl-cb\n");
     }
     ctx->reconfigured = false;
     ctx->update_new_opts = false;
@@ -514,11 +520,6 @@ static int control(struct vo *vo, uint32_t request, void *data)
         char *arg = data;
         return reparse_cmdline(p, arg);
     }
-    case VOCTRL_GET_HWDEC_INFO: {
-        struct mp_hwdec_info **arg = data;
-        *arg = p->ctx ? &p->ctx->hwdec_info : NULL;
-        return true;
-    }
     }
 
     return VO_NOTIMPL;
@@ -561,6 +562,8 @@ static int preinit(struct vo *vo)
     p->ctx->eq_changed = true;
     pthread_mutex_unlock(&p->ctx->lock);
 
+    vo->hwdec_devs = p->ctx->hwdec_devs;
+
     return 0;
 }
 
diff --git a/video/out/vo_rpi.c b/video/out/vo_rpi.c
index 9d782fc..cd37362 100644
--- a/video/out/vo_rpi.c
+++ b/video/out/vo_rpi.c
@@ -30,11 +30,10 @@
 
 #include <libavutil/rational.h>
 
-#include "osdep/atomics.h"
-
 #include "common/common.h"
 #include "common/msg.h"
 #include "options/m_config.h"
+#include "osdep/timer.h"
 #include "vo.h"
 #include "win_state.h"
 #include "video/mp_image.h"
@@ -69,11 +68,10 @@ struct priv {
     // for RAM input
     MMAL_POOL_T *swpool;
 
-    atomic_bool update_display;
-
-    pthread_mutex_t vsync_mutex;
-    pthread_cond_t vsync_cond;
+    pthread_mutex_t display_mutex;
+    pthread_cond_t display_cond;
     int64_t vsync_counter;
+    bool reload_display;
 
     int background_layer;
     int video_layer;
@@ -89,6 +87,8 @@ struct priv {
 #define ALIGN_W 32
 #define ALIGN_H 16
 
+static void recreate_renderer(struct vo *vo);
+
 // Make mpi point to buffer, assuming MMAL_ENCODING_I420.
 // buffer can be NULL.
 // Return the required buffer space.
@@ -255,16 +255,18 @@ static int create_overlays(struct vo *vo)
     struct priv *p = vo->priv;
     destroy_overlays(vo);
 
-    if (vo->opts->fullscreen) {
-    // Use the whole screen.
-    VC_RECT_T dst = {.width = p->w, .height = p->h};
-    VC_RECT_T src = {.width = 1 << 16, .height = 1 << 16};
-    VC_DISPMANX_ALPHA_T alpha = {
-        .flags = DISPMANX_FLAGS_ALPHA_FIXED_ALL_PIXELS,
-        .opacity = 0xFF,
-    };
+    if (!p->display)
+        return -1;
+
+    if (vo->opts->fullscreen && p->background) {
+        // Use the whole screen.
+        VC_RECT_T dst = {.width = p->w, .height = p->h};
+        VC_RECT_T src = {.width = 1 << 16, .height = 1 << 16};
+        VC_DISPMANX_ALPHA_T alpha = {
+            .flags = DISPMANX_FLAGS_ALPHA_FIXED_ALL_PIXELS,
+            .opacity = 0xFF,
+        };
 
-    if (p->background) {
         p->window = vc_dispmanx_element_add(p->update, p->display,
                                             p->background_layer,
                                             &dst, 0, &src,
@@ -275,7 +277,6 @@ static int create_overlays(struct vo *vo)
             return -1;
         }
     }
-    }
 
     if (p->enable_osd) {
         VC_RECT_T dst = {.x = p->x, .y = p->y,
@@ -362,16 +363,23 @@ static int set_geometry(struct vo *vo)
 static void wait_next_vsync(struct vo *vo)
 {
     struct priv *p = vo->priv;
-    pthread_mutex_lock(&p->vsync_mutex);
+    pthread_mutex_lock(&p->display_mutex);
+    struct timespec end = mp_rel_time_to_timespec(0.050);
     int64_t old = p->vsync_counter;
-    while (old == p->vsync_counter)
-        pthread_cond_wait(&p->vsync_cond, &p->vsync_mutex);
-    pthread_mutex_unlock(&p->vsync_mutex);
+    while (old == p->vsync_counter && !p->reload_display) {
+        if (pthread_cond_timedwait(&p->display_cond, &p->display_mutex, &end))
+            break;
+    }
+    pthread_mutex_unlock(&p->display_mutex);
 }
 
 static void flip_page(struct vo *vo)
 {
     struct priv *p = vo->priv;
+
+    if (!p->renderer_enabled)
+        return;
+
     struct mp_image *mpi = p->next_image;
     p->next_image = NULL;
 
@@ -407,6 +415,9 @@ static void draw_frame(struct vo *vo, struct vo_frame *frame)
 {
     struct priv *p = vo->priv;
 
+    if (!p->renderer_enabled)
+        return;
+
     mp_image_t *mpi = NULL;
     if (!frame->redraw && !frame->repeat)
         mpi = mp_image_new_ref(frame->current);
@@ -435,8 +446,7 @@ static void draw_frame(struct vo *vo, struct vo_frame *frame)
         }
         mmal_buffer_header_reset(buffer);
 
-        struct mp_image *new_ref = mp_image_new_custom_ref(&(struct mp_image){0},
-                                                           buffer,
+        struct mp_image *new_ref = mp_image_new_custom_ref(NULL, buffer,
                                                            free_mmal_buffer);
         if (!new_ref) {
             mmal_buffer_header_release(buffer);
@@ -509,6 +519,9 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     MMAL_PORT_T *input = p->renderer->input[0];
     bool opaque = params->imgfmt == IMGFMT_MMAL;
 
+    if (!p->display)
+        return -1;
+
     disable_renderer(vo);
 
     input->format->encoding = opaque ? MMAL_ENCODING_OPAQUE : MMAL_ENCODING_I420;
@@ -563,6 +576,9 @@ static struct mp_image *take_screenshot(struct vo *vo)
 {
     struct priv *p = vo->priv;
 
+    if (!p->display)
+        return NULL;
+
     struct mp_image *img = mp_image_alloc(IMGFMT_BGR0, p->w, p->h);
     if (!img)
         return NULL;
@@ -615,14 +631,15 @@ static int control(struct vo *vo, uint32_t request, void *data)
     case VOCTRL_SCREENSHOT_WIN:
         *(struct mp_image **)data = take_screenshot(vo);
         return VO_TRUE;
-    case VOCTRL_CHECK_EVENTS:
-        if (atomic_load(&p->update_display)) {
-            atomic_store(&p->update_display, false);
-            update_display_size(vo);
-            if (p->renderer_enabled)
-                set_geometry(vo);
-        }
+    case VOCTRL_CHECK_EVENTS: {
+        pthread_mutex_lock(&p->display_mutex);
+        bool reload_required = p->reload_display;
+        p->reload_display = false;
+        pthread_mutex_unlock(&p->display_mutex);
+        if (reload_required)
+            recreate_renderer(vo);
         return VO_TRUE;
+    }
     case VOCTRL_GET_DISPLAY_FPS:
         *(double *)data = p->display_fps;
         return VO_TRUE;
@@ -636,7 +653,10 @@ static void tv_callback(void *callback_data, uint32_t reason, uint32_t param1,
 {
     struct vo *vo = callback_data;
     struct priv *p = vo->priv;
-    atomic_store(&p->update_display, true);
+    pthread_mutex_lock(&p->display_mutex);
+    p->reload_display = true;
+    pthread_cond_signal(&p->display_cond);
+    pthread_mutex_unlock(&p->display_mutex);
     vo_wakeup(vo);
 }
 
@@ -644,10 +664,59 @@ static void vsync_callback(DISPMANX_UPDATE_HANDLE_T u, void *arg)
 {
     struct vo *vo = arg;
     struct priv *p = vo->priv;
-    pthread_mutex_lock(&p->vsync_mutex);
+    pthread_mutex_lock(&p->display_mutex);
     p->vsync_counter += 1;
-    pthread_cond_signal(&p->vsync_cond);
-    pthread_mutex_unlock(&p->vsync_mutex);
+    pthread_cond_signal(&p->display_cond);
+    pthread_mutex_unlock(&p->display_mutex);
+}
+
+static void destroy_dispmanx(struct vo *vo)
+{
+    struct priv *p = vo->priv;
+
+    disable_renderer(vo);
+    destroy_overlays(vo);
+
+    if (p->display) {
+        vc_dispmanx_vsync_callback(p->display, NULL, NULL);
+        vc_dispmanx_display_close(p->display);
+    }
+    p->display = 0;
+}
+
+static int recreate_dispmanx(struct vo *vo)
+{
+    struct priv *p = vo->priv;
+
+    p->display = vc_dispmanx_display_open(p->display_nr);
+    p->update = vc_dispmanx_update_start(0);
+    if (!p->display || !p->update) {
+        MP_FATAL(vo, "Could not get DISPMANX objects.\n");
+        if (p->display)
+            vc_dispmanx_display_close(p->display);
+        p->display = 0;
+        p->update = 0;
+        return -1;
+    }
+
+    update_display_size(vo);
+
+    vc_dispmanx_vsync_callback(p->display, vsync_callback, vo);
+
+    return 0;
+}
+
+static void recreate_renderer(struct vo *vo)
+{
+    MP_WARN(vo, "Recreating renderer after display change.\n");
+
+    destroy_dispmanx(vo);
+    recreate_dispmanx(vo);
+
+    if (vo->params) {
+        if (reconfig(vo, vo->params) < 0)
+            MP_FATAL(vo, "Recreation failed.\n");
+    }
 }
 
 static void uninit(struct vo *vo)
@@ -658,25 +727,18 @@ static void uninit(struct vo *vo)
 
     talloc_free(p->next_image);
 
-    destroy_overlays(vo);
+    destroy_dispmanx(vo);
 
     if (p->update)
         vc_dispmanx_update_submit_sync(p->update);
 
-    if (p->renderer) {
-        disable_renderer(vo);
+    if (p->renderer)
         mmal_component_release(p->renderer);
-    }
-
-    if (p->display) {
-        vc_dispmanx_vsync_callback(p->display, NULL, NULL);
-        vc_dispmanx_display_close(p->display);
-    }
 
     mmal_vc_deinit();
 
-    pthread_cond_destroy(&p->vsync_cond);
-    pthread_mutex_destroy(&p->vsync_mutex);
+    pthread_cond_destroy(&p->display_cond);
+    pthread_mutex_destroy(&p->display_mutex);
 }
 
 static int preinit(struct vo *vo)
@@ -696,12 +758,14 @@ static int preinit(struct vo *vo)
         return -1;
     }
 
-    p->display = vc_dispmanx_display_open(p->display_nr);
-    p->update = vc_dispmanx_update_start(0);
-    if (!p->display || !p->update) {
-        MP_FATAL(vo, "Could not get DISPMANX objects.\n");
+    pthread_mutex_init(&p->display_mutex, NULL);
+    pthread_cond_init(&p->display_cond, NULL);
+
+    if (recreate_dispmanx(vo) < 0)
+        goto fail;
+
+    if (update_display_size(vo) < 0)
         goto fail;
-    }
 
     if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &p->renderer))
     {
@@ -709,16 +773,8 @@ static int preinit(struct vo *vo)
         goto fail;
     }
 
-    if (update_display_size(vo) < 0)
-        goto fail;
-
     vc_tv_register_callback(tv_callback, vo);
 
-    pthread_mutex_init(&p->vsync_mutex, NULL);
-    pthread_cond_init(&p->vsync_cond, NULL);
-
-    vc_dispmanx_vsync_callback(p->display, vsync_callback, vo);
-
     return 0;
 
 fail:
diff --git a/video/out/vo_sdl.c b/video/out/vo_sdl.c
index 9d34564..dd18f6e 100644
--- a/video/out/vo_sdl.c
+++ b/video/out/vo_sdl.c
@@ -58,30 +58,27 @@ const struct formatmap_entry formats[] = {
     {SDL_PIXELFORMAT_UYVY, IMGFMT_UYVY, 0},
     //{SDL_PIXELFORMAT_YVYU, IMGFMT_YVYU, 0},
 #if BYTE_ORDER == BIG_ENDIAN
-    {SDL_PIXELFORMAT_RGBX8888, IMGFMT_RGBA, 0}, // has no alpha -> bad for OSD
-    {SDL_PIXELFORMAT_BGRX8888, IMGFMT_BGRA, 0}, // has no alpha -> bad for OSD
+    {SDL_PIXELFORMAT_RGB888, IMGFMT_0RGB, 0}, // RGB888 means XRGB8888
+    {SDL_PIXELFORMAT_RGBX8888, IMGFMT_RGB0, 0}, // has no alpha -> bad for OSD
+    {SDL_PIXELFORMAT_BGR888, IMGFMT_0BGR, 0}, // BGR888 means XBGR8888
+    {SDL_PIXELFORMAT_BGRX8888, IMGFMT_BGR0, 0}, // has no alpha -> bad for OSD
     {SDL_PIXELFORMAT_ARGB8888, IMGFMT_ARGB, 1}, // matches SUBBITMAP_RGBA
     {SDL_PIXELFORMAT_RGBA8888, IMGFMT_RGBA, 1},
     {SDL_PIXELFORMAT_ABGR8888, IMGFMT_ABGR, 1},
     {SDL_PIXELFORMAT_BGRA8888, IMGFMT_BGRA, 1},
-    {SDL_PIXELFORMAT_RGB24, IMGFMT_RGB24, 0},
-    {SDL_PIXELFORMAT_BGR24, IMGFMT_BGR24, 0},
-    {SDL_PIXELFORMAT_RGB888, IMGFMT_RGB24, 0},
-    {SDL_PIXELFORMAT_BGR888, IMGFMT_BGR24, 0},
-    {SDL_PIXELFORMAT_BGR565, IMGFMT_RGB565, 0},
 #else
-    {SDL_PIXELFORMAT_RGBX8888, IMGFMT_ABGR, 0}, // has no alpha -> bad for OSD
-    {SDL_PIXELFORMAT_BGRX8888, IMGFMT_ARGB, 0}, // has no alpha -> bad for OSD
+    {SDL_PIXELFORMAT_RGB888, IMGFMT_BGR0, 0}, // RGB888 means XRGB8888
+    {SDL_PIXELFORMAT_RGBX8888, IMGFMT_0BGR, 0}, // has no alpha -> bad for OSD
+    {SDL_PIXELFORMAT_BGR888, IMGFMT_RGB0, 0}, // BGR888 means XBGR8888
+    {SDL_PIXELFORMAT_BGRX8888, IMGFMT_0RGB, 0}, // has no alpha -> bad for OSD
     {SDL_PIXELFORMAT_ARGB8888, IMGFMT_BGRA, 1}, // matches SUBBITMAP_RGBA
     {SDL_PIXELFORMAT_RGBA8888, IMGFMT_ABGR, 1},
     {SDL_PIXELFORMAT_ABGR8888, IMGFMT_RGBA, 1},
     {SDL_PIXELFORMAT_BGRA8888, IMGFMT_ARGB, 1},
+#endif
     {SDL_PIXELFORMAT_RGB24, IMGFMT_RGB24, 0},
     {SDL_PIXELFORMAT_BGR24, IMGFMT_BGR24, 0},
-    {SDL_PIXELFORMAT_RGB888, IMGFMT_BGR24, 0},
-    {SDL_PIXELFORMAT_BGR888, IMGFMT_RGB24, 0},
     {SDL_PIXELFORMAT_RGB565, IMGFMT_RGB565, 0},
-#endif
 };
 
 struct keymap_entry {
diff --git a/video/out/vo_vaapi.c b/video/out/vo_vaapi.c
index 5275d4d..dc8aaac 100644
--- a/video/out/vo_vaapi.c
+++ b/video/out/vo_vaapi.c
@@ -68,7 +68,6 @@ struct priv {
     struct vo               *vo;
     VADisplay                display;
     struct mp_vaapi_ctx     *mpvaapi;
-    struct mp_hwdec_info     hwdec_info;
 
     struct mp_image_params   image_params;
     struct mp_rect           src_rect;
@@ -515,11 +514,6 @@ static int control(struct vo *vo, uint32_t request, void *data)
     struct priv *p = vo->priv;
 
     switch (request) {
-    case VOCTRL_GET_HWDEC_INFO: {
-        struct mp_hwdec_info **arg = data;
-        *arg = &p->hwdec_info;
-        return true;
-    }
     case VOCTRL_SET_EQUALIZER: {
         struct voctrl_set_equalizer_args *eq = data;
         return set_equalizer(p, eq->name, eq->value);
@@ -561,6 +555,11 @@ static void uninit(struct vo *vo)
         free_subpicture(p, &part->image);
     }
 
+    if (vo->hwdec_devs) {
+        hwdec_devices_remove(vo->hwdec_devs, &p->mpvaapi->hwctx);
+        hwdec_devices_destroy(vo->hwdec_devs);
+    }
+
     va_destroy(p->mpvaapi);
 
     vo_x11_uninit(vo);
@@ -591,8 +590,6 @@ static int preinit(struct vo *vo)
         goto fail;
     }
 
-    p->hwdec_info.hwctx = &p->mpvaapi->hwctx;
-
     if (va_guess_if_emulated(p->mpvaapi)) {
         MP_WARN(vo, "VA-API is most likely emulated via VDPAU.\n"
                     "It's better to use VDPAU directly with: --vo=vdpau\n");
@@ -645,6 +642,10 @@ static int preinit(struct vo *vo)
             p->va_num_display_attrs = 0;
         p->mp_display_attr = talloc_zero_array(vo, int, p->va_num_display_attrs);
     }
+
+    vo->hwdec_devs = hwdec_devices_create();
+    hwdec_devices_add(vo->hwdec_devs, &p->mpvaapi->hwctx);
+
     return 0;
 
 fail:
diff --git a/video/out/vo_vdpau.c b/video/out/vo_vdpau.c
index b85780e..15472b2 100644
--- a/video/out/vo_vdpau.c
+++ b/video/out/vo_vdpau.c
@@ -71,7 +71,6 @@ struct vdpctx {
     struct vdp_functions              *vdp;
     VdpDevice                          vdp_device;
     uint64_t                           preemption_counter;
-    struct mp_hwdec_info               hwdec_info;
 
     struct m_color                     colorkey;
 
@@ -448,7 +447,6 @@ static void mark_vdpau_objects_uninitialized(struct vo *vo)
 
     forget_frames(vo, false);
     vc->black_pixel = VDP_INVALID_HANDLE;
-    vc->video_mixer->video_mixer = VDP_INVALID_HANDLE;
     vc->flip_queue = VDP_INVALID_HANDLE;
     vc->flip_target = VDP_INVALID_HANDLE;
     for (int i = 0; i < MAX_OUTPUT_SURFACES; i++)
@@ -1029,6 +1027,9 @@ static void uninit(struct vo *vo)
 {
     struct vdpctx *vc = vo->priv;
 
+    hwdec_devices_remove(vo->hwdec_devs, &vc->mpvdp->hwctx);
+    hwdec_devices_destroy(vo->hwdec_devs);
+
     /* Destroy all vdpau objects */
     mp_vdpau_mixer_destroy(vc->video_mixer);
     destroy_vdpau_objects(vo);
@@ -1054,7 +1055,8 @@ static int preinit(struct vo *vo)
         return -1;
     }
 
-    vc->hwdec_info.hwctx = &vc->mpvdp->hwctx;
+    vo->hwdec_devs = hwdec_devices_create();
+    hwdec_devices_add(vo->hwdec_devs, &vc->mpvdp->hwctx);
 
     vc->video_mixer = mp_vdpau_mixer_create(vc->mpvdp, vo->log);
 
@@ -1118,11 +1120,6 @@ static int control(struct vo *vo, uint32_t request, void *data)
     check_preemption(vo);
 
     switch (request) {
-    case VOCTRL_GET_HWDEC_INFO: {
-        struct mp_hwdec_info **arg = data;
-        *arg = &vc->hwdec_info;
-        return true;
-    }
     case VOCTRL_GET_PANSCAN:
         return VO_TRUE;
     case VOCTRL_SET_PANSCAN:
diff --git a/video/out/vo_wayland.c b/video/out/vo_wayland.c
index 57d6c7f..2997b38 100644
--- a/video/out/vo_wayland.c
+++ b/video/out/vo_wayland.c
@@ -249,10 +249,15 @@ static bool resize(struct priv *p)
     if (!p->video_bufpool.back_buffer || SHM_BUFFER_IS_BUSY(p->video_bufpool.back_buffer))
         return false; // skip resizing if we can't guarantee pixel perfectness!
 
+    int32_t scale = 1;
     int32_t x = wl->window.sh_x;
     int32_t y = wl->window.sh_y;
-    wl->vo->dwidth = wl->window.sh_width;
-    wl->vo->dheight = wl->window.sh_height;
+
+    if (wl->display.current_output)
+        scale = wl->display.current_output->scale;
+
+    wl->vo->dwidth = scale*wl->window.sh_width;
+    wl->vo->dheight = scale*wl->window.sh_height;
 
     vo_get_src_dst_rects(p->vo, &p->src, &p->dst, &p->osd);
     p->src_w = p->src.x1 - p->src.x0;
@@ -273,6 +278,7 @@ static bool resize(struct priv *p)
     if (y != 0)
         y = wl->window.height - p->dst_h;
 
+    wl_surface_set_buffer_scale(wl->window.video_surface, scale);
     mp_sws_set_from_cmdline(p->sws, p->vo->opts->sws_opts);
     p->sws->src = p->in_format;
     p->sws->dst = (struct mp_image_params) {
@@ -301,7 +307,7 @@ static bool resize(struct priv *p)
     if (!p->enable_alpha) {
         struct wl_region *opaque =
             wl_compositor_create_region(wl->display.compositor);
-        wl_region_add(opaque, 0, 0, p->dst_w, p->dst_h);
+        wl_region_add(opaque, 0, 0, p->dst_w/scale, p->dst_h/scale);
         wl_surface_set_opaque_region(wl->window.video_surface, opaque);
         wl_region_destroy(opaque);
     }
@@ -464,14 +470,19 @@ static const bool osd_formats[SUBBITMAP_COUNT] = {
 
 static void draw_osd(struct vo *vo)
 {
+    int32_t scale = 1;
     struct priv *p = vo->priv;
 
+    if (p->wl && p->wl->display.current_output)
+        scale = p->wl->display.current_output->scale;
+
     // detach all buffers and attach all needed buffers in osd_draw
     // only the most recent attach & commit is applied once the parent surface
     // is committed
     for (int i = 0; i < MAX_OSD_PARTS; ++i) {
         struct wl_surface *s = p->osd_surfaces[i];
         wl_surface_attach(s, NULL, 0, 0);
+        wl_surface_set_buffer_scale(s, scale);
         wl_surface_damage(s, 0, 0, p->dst_w, p->dst_h);
         wl_surface_commit(s);
     }
diff --git a/video/out/vo_xv.c b/video/out/vo_xv.c
index e02ea2d..1e7ae7c 100644
--- a/video/out/vo_xv.c
+++ b/video/out/vo_xv.c
@@ -577,6 +577,15 @@ static bool allocate_xvimage(struct vo *vo, int foo)
             return false;
         XSync(x11->display, False);
     }
+
+    if ((ctx->xvimage[foo]->width != aligned_w) ||
+        (ctx->xvimage[foo]->height != ctx->image_height)) {
+        MP_ERR(vo, "Got XvImage with incorrect size: %ux%u (expected %ux%u)\n",
+               ctx->xvimage[foo]->width, ctx->xvimage[foo]->height,
+               aligned_w, ctx->image_height);
+        return false;
+    }
+
     struct mp_image img = get_xv_buffer(vo, foo);
     img.w = aligned_w;
     mp_image_clear(&img, 0, 0, img.w, img.h);
diff --git a/video/out/w32_common.c b/video/out/w32_common.c
index d26de3b..f3b59f1 100644
--- a/video/out/w32_common.c
+++ b/video/out/w32_common.c
@@ -15,13 +15,13 @@
  * with mpv.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <initguid.h>
 #include <stdio.h>
 #include <limits.h>
 #include <pthread.h>
 #include <assert.h>
 #include <windows.h>
 #include <windowsx.h>
-#include <initguid.h>
 #include <ole2.h>
 #include <shobjidl.h>
 #include <avrt.h>
@@ -45,6 +45,9 @@
 #include "misc/rendezvous.h"
 #include "mpv_talloc.h"
 
+EXTERN_C IMAGE_DOS_HEADER __ImageBase;
+#define HINST_THISCOMPONENT ((HINSTANCE)&__ImageBase)
+
 static const wchar_t classname[] = L"mpv";
 
 static __thread struct vo_w32_state *w32_thread_context;
@@ -1067,15 +1070,31 @@ static void reinit_window_state(struct vo_w32_state *w32)
 
     RECT cr = r;
     add_window_borders(w32->window, &r);
+    // Check on client area size instead of window size on --fit-border=no
+    long o_w;
+    long o_h;
+    if( w32->opts->fit_border ) {
+        o_w = r.right - r.left;
+        o_h = r.bottom - r.top;
+    } else {
+        o_w = cr.right - cr.left;
+        o_h = cr.bottom - cr.top;
+    }
 
-    if (!w32->current_fs &&
-        ((r.right - r.left) >= screen_w || (r.bottom - r.top) >= screen_h))
+    if ( !w32->current_fs && ( o_w > screen_w || o_h > screen_h ) )
     {
         MP_VERBOSE(w32, "requested window size larger than the screen\n");
         // Use the aspect of the client area, not the full window size.
         // Basically, try to compute the maximum window size.
-        long n_w = screen_w - (r.right - cr.right) - (cr.left - r.left) - 1;
-        long n_h = screen_h - (r.bottom - cr.bottom) - (cr.top - r.top) - 1;
+        long n_w;
+        long n_h;
+        if( w32->opts->fit_border ) {
+            n_w = screen_w - (r.right - cr.right) - (cr.left - r.left);
+            n_h = screen_h - (r.bottom - cr.bottom) - (cr.top - r.top);
+        } else {
+            n_w = screen_w;
+            n_h = screen_h;
+        }
         // Letterbox
         double asp = (cr.right - cr.left) / (double)(cr.bottom - cr.top);
         double s_asp = n_w / (double)n_h;
@@ -1084,15 +1103,28 @@ static void reinit_window_state(struct vo_w32_state *w32)
         } else {
             n_w = n_h * asp;
         }
+        // Save new size
+        w32->dw = n_w;
+        w32->dh = n_h;
+        // Get old window center
+        long o_cx = r.left + (r.right - r.left) / 2;
+        long o_cy = r.top + (r.bottom - r.top) / 2;
+        // Add window borders to the new window size
         r = (RECT){.right = n_w, .bottom = n_h};
         add_window_borders(w32->window, &r);
-        // Center the final window
+        // Get top and left border size for client area position calculation
+        long b_top = -r.top;
+        long b_left = -r.left;
+        // Center the final window around the old window center
         n_w = r.right - r.left;
         n_h = r.bottom - r.top;
-        r.left = w32->screenrc.x0 + screen_w / 2 - n_w / 2;
-        r.top = w32->screenrc.y0 + screen_h / 2 - n_h / 2;
+        r.left = o_cx - n_w / 2;
+        r.top = o_cy - n_h / 2;
         r.right = r.left + n_w;
         r.bottom = r.top + n_h;
+        // Save new client area position
+        w32->window_x = r.left + b_left;
+        w32->window_y = r.top + b_top;
     }
 
     MP_VERBOSE(w32, "reset window bounds: %d:%d:%d:%d\n",
@@ -1116,6 +1148,7 @@ static void gui_thread_reconfig(void *ptr)
     vo_apply_window_geometry(vo, &geo);
 
     bool reset_size = w32->o_dwidth != vo->dwidth || w32->o_dheight != vo->dheight;
+    bool pos_init = false;
 
     w32->o_dwidth = vo->dwidth;
     w32->o_dheight = vo->dheight;
@@ -1132,6 +1165,7 @@ static void gui_thread_reconfig(void *ptr)
         } else {
             w32->window_bounds_initialized = true;
             reset_size = true;
+            pos_init = true;
             w32->window_x = w32->prev_x = geo.win.x0;
             w32->window_y = w32->prev_y = geo.win.y0;
         }
@@ -1147,6 +1181,12 @@ static void gui_thread_reconfig(void *ptr)
         vo->dheight = r.bottom;
     }
 
+    // Recenter window around old position on new video size
+    // excluding the case when the initial position is handled by win_state.
+    if (!pos_init) {
+        w32->window_x += w32->dw / 2 - vo->dwidth / 2;
+        w32->window_y += w32->dh / 2 - vo->dheight / 2;
+    }
     w32->dw = vo->dwidth;
     w32->dh = vo->dheight;
 
@@ -1184,14 +1224,12 @@ static void *gui_thread(void *ptr)
 
     thread_disable_ime();
 
-    HINSTANCE hInstance = GetModuleHandleW(NULL);
-
     WNDCLASSEXW wcex = {
         .cbSize = sizeof wcex,
         .style = CS_HREDRAW | CS_VREDRAW,
         .lpfnWndProc = WndProc,
-        .hInstance = hInstance,
-        .hIcon = LoadIconW(hInstance, L"IDI_ICON1"),
+        .hInstance = HINST_THISCOMPONENT,
+        .hIcon = LoadIconW(HINST_THISCOMPONENT, L"IDI_ICON1"),
         .hCursor = LoadCursor(NULL, IDC_ARROW),
         .lpszClassName = classname,
     };
@@ -1209,13 +1247,13 @@ static void *gui_thread(void *ptr)
                                       classname,
                                       WS_CHILD | WS_VISIBLE,
                                       0, 0, r.right, r.bottom,
-                                      w32->parent, 0, hInstance, NULL);
+                                      w32->parent, 0, HINST_THISCOMPONENT, NULL);
     } else {
         w32->window = CreateWindowExW(0, classname,
                                       classname,
                                       update_style(w32, 0),
                                       CW_USEDEFAULT, SW_HIDE, 100, 100,
-                                      0, 0, hInstance, NULL);
+                                      0, 0, HINST_THISCOMPONENT, NULL);
     }
 
     if (!w32->window) {
@@ -1374,9 +1412,13 @@ static int gui_thread_control(struct vo_w32_state *w32, int request, void *arg)
         if (!w32->window_bounds_initialized)
             return VO_FALSE;
         if (w32->current_fs) {
+            w32->prev_x += w32->prev_width / 2 - s[0] / 2;
+            w32->prev_y += w32->prev_height / 2 - s[1] / 2;
             w32->prev_width = s[0];
             w32->prev_height = s[1];
         } else {
+            w32->window_x += w32->dw / 2 - s[0] / 2;
+            w32->window_y += w32->dh / 2 - s[1] / 2;
             w32->dw = s[0];
             w32->dh = s[1];
         }
@@ -1419,7 +1461,7 @@ static int gui_thread_control(struct vo_w32_state *w32, int request, void *arg)
         if (!w32->taskbar_list3 || !w32->tbtnCreated)
             return VO_TRUE;
 
-        if (!pstate->playing) {
+        if (!pstate->playing || !pstate->taskbar_progress) {
             ITaskbarList3_SetProgressState(w32->taskbar_list3, w32->window,
                                            TBPF_NOPROGRESS);
             return VO_TRUE;
diff --git a/video/out/wayland_common.c b/video/out/wayland_common.c
index b9dac90..0e44ddd 100644
--- a/video/out/wayland_common.c
+++ b/video/out/wayland_common.c
@@ -189,9 +189,22 @@ static void output_handle_mode(void *data,
     output->refresh_rate = refresh;
 }
 
+static void output_handle_done(void* data, struct wl_output *wl_output)
+{
+}
+
+static void output_handle_scale(void* data, struct wl_output *wl_output,
+                                int32_t factor)
+{
+    struct vo_wayland_output *output = data;
+    output->scale = factor;
+}
+
 static const struct wl_output_listener output_listener = {
     output_handle_geometry,
-    output_handle_mode
+    output_handle_mode,
+    output_handle_done,
+    output_handle_scale
 };
 
 
@@ -211,6 +224,8 @@ static void surface_handle_enter(void *data,
             break;
         }
     }
+
+    wl->window.events |= VO_EVENT_WIN_STATE;
 }
 
 static void surface_handle_leave(void *data,
@@ -401,11 +416,15 @@ static void pointer_handle_motion(void *data,
                                   wl_fixed_t sx_w,
                                   wl_fixed_t sy_w)
 {
+    int32_t scale = 1;
     struct vo_wayland_state *wl = data;
 
+    if (wl->display.current_output)
+        scale = wl->display.current_output->scale;
+
     wl->cursor.pointer = pointer;
-    wl->window.mouse_x = wl_fixed_to_int(sx_w);
-    wl->window.mouse_y = wl_fixed_to_int(sy_w);
+    wl->window.mouse_x = scale*wl_fixed_to_int(sx_w);
+    wl->window.mouse_y = scale*wl_fixed_to_int(sy_w);
 
     mp_input_set_mouse_pos(wl->vo->input_ctx, wl->window.mouse_x,
                                               wl->window.mouse_y);
@@ -521,7 +540,7 @@ static void data_device_handle_data_offer(void *data,
 {
     struct vo_wayland_state *wl = data;
     if (wl->input.offer) {
-        MP_ERR(wl, "There is already a dnd entry point.\n");
+        MP_DBG(wl, "There is already a dnd entry point.\n");
         wl_data_offer_destroy(wl->input.offer);
     }
 
@@ -606,7 +625,8 @@ static void registry_handle_global (void *data,
     if (strcmp(interface, "wl_compositor") == 0) {
 
         wl->display.compositor = wl_registry_bind(reg, id,
-                                                  &wl_compositor_interface, 1);
+                                                  &wl_compositor_interface,
+                                                  MPMIN(3, version));
     }
 
     else if (strcmp(interface, "wl_shell") == 0) {
@@ -625,7 +645,9 @@ static void registry_handle_global (void *data,
             talloc_zero(wl, struct vo_wayland_output);
 
         output->id = id;
-        output->output = wl_registry_bind(reg, id, &wl_output_interface, 1);
+        output->scale = 1;
+        output->output = wl_registry_bind(reg, id, &wl_output_interface,
+                                          MPMIN(2, version));
 
         wl_output_add_listener(output->output, &output_listener, output);
         wl_list_insert(&wl->display.output_list, &output->link);
@@ -739,7 +761,6 @@ static void schedule_resize(struct vo_wayland_state *wl,
 {
     int32_t minimum_size = 150;
     int32_t x, y;
-    float temp_aspect = width / (float) MPMAX(height, 1);
     float win_aspect = wl->window.aspect;
     if (win_aspect <= 0)
         win_aspect = 1;
@@ -770,12 +791,6 @@ static void schedule_resize(struct vo_wayland_state *wl,
         case WL_SHELL_SURFACE_RESIZE_BOTTOM_RIGHT:
             height = (1 / win_aspect) * width;
             break;
-        default:
-            if (wl->window.aspect < temp_aspect)
-                width = wl->window.aspect * height;
-            else
-                height = (1 / win_aspect) * width;
-            break;
     }
 
     if (edges & WL_SHELL_SURFACE_RESIZE_LEFT)
@@ -792,7 +807,7 @@ static void schedule_resize(struct vo_wayland_state *wl,
     wl->window.sh_height = height;
     wl->window.sh_x = x;
     wl->window.sh_y = y;
-    wl->window.events |= VO_EVENT_RESIZE;
+    wl->window.events |= VO_EVENT_WIN_STATE | VO_EVENT_RESIZE;
     wl->vo->dwidth = width;
     wl->vo->dheight = height;
 }
@@ -1023,10 +1038,11 @@ int vo_wayland_init (struct vo *vo)
                        "\tvendor: %s\n"
                        "\tmodel: %s\n"
                        "\tw: %d, h: %d\n"
-                       "\tHz: %d\n",
+                       "\tscale: %d\n"
+                       "\tHz: %f\n",
                        o->make, o->model,
-                       o->width, o->height,
-                       o->refresh_rate / 1000);
+                       o->width, o->height, o->scale,
+                       o->refresh_rate / 1000.0f);
     }
 
     vo->event_fd = wl->display.display_fd;
@@ -1276,7 +1292,7 @@ int vo_wayland_control (struct vo *vo, int *events, int request, void *arg)
             break;
 
         // refresh rate is stored in milli-Hertz (mHz)
-        double fps = wl->display.current_output->refresh_rate / 1000;
+        double fps = wl->display.current_output->refresh_rate / 1000.0f;
         *(double*) arg = fps;
         return VO_TRUE;
     }
diff --git a/video/out/wayland_common.h b/video/out/wayland_common.h
index d23b2f2..ec3f72c 100644
--- a/video/out/wayland_common.h
+++ b/video/out/wayland_common.h
@@ -41,6 +41,7 @@ struct vo_wayland_output {
     uint32_t flags;
     int32_t width;
     int32_t height;
+    int32_t scale;
     int32_t refresh_rate; // fps (mHz)
     const char *make;
     const char *model;
diff --git a/video/out/x11_common.c b/video/out/x11_common.c
index 034f785..647a910 100644
--- a/video/out/x11_common.c
+++ b/video/out/x11_common.c
@@ -280,6 +280,9 @@ static void vo_set_cursor_hidden(struct vo *vo, bool cursor_hidden)
 static int x11_errorhandler(Display *display, XErrorEvent *event)
 {
     struct mp_log *log = x11_error_output;
+    if (!log)
+        return 0;
+
     char msg[60];
     XGetErrorText(display, event->error_code, (char *) &msg, sizeof(msg));
 
@@ -746,8 +749,8 @@ void vo_x11_uninit(struct vo *vo)
     if (x11->xim)
         XCloseIM(x11->xim);
     if (x11->display) {
-        x11_error_output = NULL;
         XSetErrorHandler(NULL);
+        x11_error_output = NULL;
         XCloseDisplay(x11->display);
     }
 
@@ -950,6 +953,22 @@ static int get_mods(unsigned int state)
     return modifiers;
 }
 
+static void vo_x11_update_composition_hint(struct vo *vo)
+{
+    struct vo_x11_state *x11 = vo->x11;
+
+    long hint = 0;
+    switch (vo->opts->x11_bypass_compositor) {
+    case 0: hint = 0; break; // leave default
+    case 1: hint = 1; break; // always bypass
+    case 2: hint = x11->fs ? 1 : 0; break; // bypass in FS
+    case 3: hint = 2; break; // always enable
+    }
+
+    XChangeProperty(x11->display, x11->window, XA(x11,_NET_WM_BYPASS_COMPOSITOR),
+                    XA_CARDINAL, 32, PropModeReplace, (unsigned char *)&hint, 1);
+}
+
 static void vo_x11_check_net_wm_state_fullscreen_change(struct vo *vo)
 {
     struct vo_x11_state *x11 = vo->x11;
@@ -986,6 +1005,8 @@ static void vo_x11_check_net_wm_state_fullscreen_change(struct vo *vo)
 
             x11->size_changed_during_fs = false;
             x11->pos_changed_during_fs = false;
+
+            vo_x11_update_composition_hint(vo);
         }
     }
 }
@@ -1437,15 +1458,11 @@ static void vo_x11_create_window(struct vo *vo, XVisualInfo *vis,
     }
 
     if (!x11->parent) {
-        if (vo->opts->x11_bypass_compositor) {
-            long v = 1; // request disabling compositor
-            XChangeProperty(x11->display, x11->window,
-                XA(x11,_NET_WM_BYPASS_COMPOSITOR), XA_CARDINAL, 32,
-                PropModeReplace, (unsigned char *)&v, 1);
-        }
+        vo_x11_update_composition_hint(vo);
         vo_x11_set_wm_icon(x11);
         vo_x11_update_window_title(vo);
         vo_x11_dnd_init_window(vo);
+        vo_x11_set_property_utf8(vo, XA(x11, _GTK_THEME_VARIANT), "dark");
     }
     vo_x11_xembed_update(x11, 0);
 }
@@ -1485,6 +1502,8 @@ static void vo_x11_map_window(struct vo *vo, struct mp_rect rc)
                         XA_CARDINAL, 32, PropModeReplace, (unsigned char *)&v, 1);
     }
 
+    vo_x11_update_composition_hint(vo);
+
     // map window
     int events = StructureNotifyMask | ExposureMask | PropertyChangeMask |
                  LeaveWindowMask | EnterWindowMask;
@@ -1731,6 +1750,8 @@ static void vo_x11_fullscreen(struct vo *vo)
 
     x11->size_changed_during_fs = false;
     x11->pos_changed_during_fs = false;
+
+    vo_x11_update_composition_hint(vo);
 }
 
 int vo_x11_control(struct vo *vo, int *events, int request, void *arg)
diff --git a/video/vaapi.c b/video/vaapi.c
index 61d94ef..f8d0fab 100644
--- a/video/vaapi.c
+++ b/video/vaapi.c
@@ -128,8 +128,7 @@ struct mp_vaapi_ctx *va_initialize(VADisplay *display, struct mp_log *plog,
         .display = display,
         .hwctx = {
             .type = HWDEC_VAAPI,
-            .priv = res,
-            .vaapi_ctx = res,
+            .ctx = res,
             .download_image = ctx_download_image,
         },
     };
@@ -487,6 +486,38 @@ struct mp_image *va_surface_download(struct mp_image *src,
     return NULL;
 }
 
+// Set the hw_subfmt from the surface's real format. Because of this bug:
+//      https://bugs.freedesktop.org/show_bug.cgi?id=79848
+// it should be assumed that the real format is only known after an arbitrary
+// vaCreateContext() call has been made, or even better, after the surface
+// has been rendered to.
+// If the hw_subfmt is already set, this is a NOP.
+void va_surface_init_subformat(struct mp_image *mpi)
+{
+    VAStatus status;
+    if (mpi->params.hw_subfmt)
+        return;
+    struct va_surface *p = va_surface_in_mp_image(mpi);
+    if (!p)
+        return;
+
+    VAImage va_image = { .image_id = VA_INVALID_ID };
+
+    va_lock(p->ctx);
+
+    status = vaDeriveImage(p->display, va_surface_id(mpi), &va_image);
+    if (status != VA_STATUS_SUCCESS)
+        goto err;
+
+    mpi->params.hw_subfmt = va_image.format.fourcc;
+
+    status = vaDestroyImage(p->display, va_image.image_id);
+    CHECK_VA_STATUS(p->ctx, "vaDestroyImage()");
+
+err:
+    va_unlock(p->ctx);
+}
+
 struct pool_alloc_ctx {
     struct mp_vaapi_ctx *vaapi;
     int rt_format;
diff --git a/video/vaapi.h b/video/vaapi.h
index 11ff2c9..3f0d1dc 100644
--- a/video/vaapi.h
+++ b/video/vaapi.h
@@ -69,6 +69,8 @@ struct mp_image *va_surface_download(struct mp_image *src,
 int va_surface_alloc_imgfmt(struct mp_image *img, int imgfmt);
 int va_surface_upload(struct mp_image *va_dst, struct mp_image *sw_src);
 
+void va_surface_init_subformat(struct mp_image *mpi);
+
 bool va_guess_if_emulated(struct mp_vaapi_ctx *ctx);
 
 #endif
diff --git a/video/vdpau.c b/video/vdpau.c
index 9dfbc2b..dffb02e 100644
--- a/video/vdpau.c
+++ b/video/vdpau.c
@@ -28,11 +28,61 @@
 #include "mp_image_pool.h"
 #include "vdpau_mixer.h"
 
+static struct mp_image *download_image_yuv(struct mp_hwdec_ctx *hwctx,
+                                           struct mp_image *mpi,
+                                           struct mp_image_pool *swpool)
+{
+    struct mp_vdpau_ctx *ctx = hwctx->ctx;
+    struct vdp_functions *vdp = &ctx->vdp;
+    VdpStatus vdp_st;
+
+    if (mpi->imgfmt != IMGFMT_VDPAU || mp_vdpau_mixed_frame_get(mpi))
+        return NULL;
+
+    VdpVideoSurface surface = (uintptr_t)mpi->planes[3];
+
+    VdpChromaType s_chroma_type;
+    uint32_t s_w, s_h;
+    vdp_st = vdp->video_surface_get_parameters(surface, &s_chroma_type, &s_w, &s_h);
+    CHECK_VDP_ERROR_NORETURN(ctx,
+                    "Error when calling vdp_video_surface_get_parameters");
+    if (vdp_st != VDP_STATUS_OK)
+        return NULL;
+
+    // Only 4:2:0 surfaces are read back as NV12; other types return NULL.
+    if (s_chroma_type != VDP_CHROMA_TYPE_420)
+        return NULL;
+
+    // Allocate at the uncropped surface size, since get_bits writes all of it.
+    struct mp_image *out = mp_image_pool_get(swpool, IMGFMT_NV12, s_w, s_h);
+    if (!out)
+        return NULL;
+
+    mp_image_set_size(out, mpi->w, mpi->h); // report mpi's size, not s_w x s_h
+    mp_image_copy_attributes(out, mpi);
+
+    vdp_st = vdp->video_surface_get_bits_y_cb_cr(surface,
+                                                 VDP_YCBCR_FORMAT_NV12,
+                                                 (void * const *)out->planes,
+                                                 out->stride);
+    CHECK_VDP_ERROR_NORETURN(ctx,
+                "Error when calling vdp_video_surface_get_bits_y_cb_cr");
+    if (vdp_st != VDP_STATUS_OK) {
+        talloc_free(out);
+        return NULL;
+    }
+
+    return out;
+}
+
 static struct mp_image *download_image(struct mp_hwdec_ctx *hwctx,
                                        struct mp_image *mpi,
                                        struct mp_image_pool *swpool)
 {
-    struct mp_vdpau_ctx *ctx = hwctx->vdpau_ctx;
+    if (mpi->imgfmt != IMGFMT_VDPAU && mpi->imgfmt != IMGFMT_VDPAU_OUTPUT)
+        return NULL;
+
+    struct mp_vdpau_ctx *ctx = hwctx->ctx;
     struct vdp_functions *vdp = &ctx->vdp;
     VdpStatus vdp_st;
 
@@ -40,6 +90,10 @@ static struct mp_image *download_image(struct mp_hwdec_ctx *hwctx,
     int w, h;
     mp_image_params_get_dsize(&mpi->params, &w, &h);
 
+    res = download_image_yuv(hwctx, mpi, swpool);
+    if (res)
+        return res;
+
     // Abuse this lock for our own purposes. It could use its own lock instead.
     pthread_mutex_lock(&ctx->pool_lock);
 
@@ -268,8 +322,7 @@ static struct mp_image *create_ref(struct mp_vdpau_ctx *ctx, int index)
     struct surface_ref *ref = talloc_ptrtype(NULL, ref);
     *ref = (struct surface_ref){ctx, index};
     struct mp_image *res =
-        mp_image_new_custom_ref(&(struct mp_image){0}, ref,
-                                release_decoder_surface);
+        mp_image_new_custom_ref(NULL, ref, release_decoder_surface);
     if (res) {
         mp_image_setfmt(res, e->rgb ? IMGFMT_VDPAU_OUTPUT : IMGFMT_VDPAU);
         mp_image_set_size(res, e->w, e->h);
@@ -396,8 +449,7 @@ struct mp_vdpau_ctx *mp_vdpau_create_device_x11(struct mp_log *log, Display *x11
         .preemption_counter = 1,
         .hwctx = {
             .type = HWDEC_VDPAU,
-            .priv = ctx,
-            .vdpau_ctx = ctx,
+            .ctx = ctx,
             .download_image = download_image,
         },
         .getimg_surface = VDP_INVALID_HANDLE,
diff --git a/video/vdpau.h b/video/vdpau.h
index db73a87..389e1c7 100644
--- a/video/vdpau.h
+++ b/video/vdpau.h
@@ -23,6 +23,9 @@
 #define CHECK_VDP_ERROR(ctx, message) \
     CHECK_VDP_ERROR_ST(ctx, message, return -1;)
 
+#define CHECK_VDP_ERROR_NORETURN(ctx, message) \
+    CHECK_VDP_ERROR_ST(ctx, message, ;)
+
 #define CHECK_VDP_WARNING(ctx, message) \
     do { \
         if (vdp_st != VDP_STATUS_OK) \
diff --git a/video/vdpau_mixer.c b/video/vdpau_mixer.c
index 7025aef..d6f93a9 100644
--- a/video/vdpau_mixer.c
+++ b/video/vdpau_mixer.c
@@ -71,6 +71,7 @@ struct mp_vdpau_mixer *mp_vdpau_mixer_create(struct mp_vdpau_ctx *vdp_ctx,
             .capabilities = MP_CSP_EQ_CAPS_COLORMATRIX,
         },
     };
+    mp_vdpau_handle_preemption(mixer->ctx, &mixer->preemption_counter);
     return mixer;
 }
 
@@ -228,6 +229,13 @@ int mp_vdpau_mixer_render(struct mp_vdpau_mixer *mixer,
     if (!video_rect)
         video_rect = &fallback_rect;
 
+    int pe = mp_vdpau_handle_preemption(mixer->ctx, &mixer->preemption_counter);
+    if (pe < 1) {
+        mixer->video_mixer = VDP_INVALID_HANDLE;
+        if (pe < 0)
+            return -1;
+    }
+
     if (video->imgfmt == IMGFMT_VDPAU_OUTPUT) {
         VdpOutputSurface surface = (uintptr_t)video->planes[3];
         int flags = VDP_OUTPUT_SURFACE_RENDER_ROTATE_0;
diff --git a/video/vdpau_mixer.h b/video/vdpau_mixer.h
index 97bef86..716b57e 100644
--- a/video/vdpau_mixer.h
+++ b/video/vdpau_mixer.h
@@ -30,6 +30,7 @@ struct mp_vdpau_mixer_frame {
 struct mp_vdpau_mixer {
     struct mp_log *log;
     struct mp_vdpau_ctx *ctx;
+    uint64_t preemption_counter;
     bool initialized;
 
     struct mp_image_params image_params;
author	James Cowgill <james410@cowgill.org.uk>	2016-07-04 11:19:11 +0200
committer	James Cowgill <james410@cowgill.org.uk>	2016-07-04 11:19:11 +0200
commit	b3df5144ae0631b8634e535ba90245e8cdfd2a0a (patch)
tree	bc955df92f24b7140d3e0d4ec56edcfa74b32c5b /video
parent	36e11d485bf132c7ae9cf5c3433ae40d63adb54d (diff)