Diffstat (limited to 'video/out'): 113 files changed, 14216 insertions(+), 5982 deletions(-)
diff --git a/video/out/cocoa/window.m b/video/out/cocoa/window.m index 6d464a1..2feaab9 100644 --- a/video/out/cocoa/window.m +++ b/video/out/cocoa/window.m @@ -386,8 +386,11 @@ - (NSRect)constrainFrameRect:(NSRect)nf toScreen:(NSScreen *)screen { - if (_is_animating && ![self.adapter isInFullScreenMode]) + if ((_is_animating && ![self.adapter isInFullScreenMode]) || + (!_is_animating && [self.adapter isInFullScreenMode])) + { return nf; + } screen = screen ?: self.screen ?: [NSScreen mainScreen]; NSRect of = [self frame]; diff --git a/video/out/d3d11/context.c b/video/out/d3d11/context.c new file mode 100644 index 0000000..b02d2e8 --- /dev/null +++ b/video/out/d3d11/context.c @@ -0,0 +1,244 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "common/msg.h" +#include "options/m_config.h" +#include "osdep/windows_utils.h" + +#include "video/out/gpu/context.h" +#include "video/out/gpu/d3d11_helpers.h" +#include "video/out/gpu/spirv.h" +#include "video/out/w32_common.h" +#include "ra_d3d11.h" + +struct d3d11_opts { + int feature_level; + int warp; + int flip; + int sync_interval; +}; + +#define OPT_BASE_STRUCT struct d3d11_opts +const struct m_sub_options d3d11_conf = { + .opts = (const struct m_option[]) { + OPT_CHOICE("d3d11-warp", warp, 0, + ({"auto", -1}, + {"no", 0}, + {"yes", 1})), + OPT_CHOICE("d3d11-feature-level", feature_level, 0, + ({"12_1", D3D_FEATURE_LEVEL_12_1}, + {"12_0", D3D_FEATURE_LEVEL_12_0}, + {"11_1", D3D_FEATURE_LEVEL_11_1}, + {"11_0", D3D_FEATURE_LEVEL_11_0}, + {"10_1", D3D_FEATURE_LEVEL_10_1}, + {"10_0", D3D_FEATURE_LEVEL_10_0}, + {"9_3", D3D_FEATURE_LEVEL_9_3}, + {"9_2", D3D_FEATURE_LEVEL_9_2}, + {"9_1", D3D_FEATURE_LEVEL_9_1})), + OPT_FLAG("d3d11-flip", flip, 0), + OPT_INTRANGE("d3d11-sync-interval", sync_interval, 0, 0, 4), + {0} + }, + .defaults = &(const struct d3d11_opts) { + .feature_level = D3D_FEATURE_LEVEL_12_1, + .warp = -1, + .flip = 1, + .sync_interval = 1, + }, + .size = sizeof(struct d3d11_opts) +}; + +struct priv { + struct d3d11_opts *opts; + + struct ra_tex *backbuffer; + ID3D11Device *device; + IDXGISwapChain *swapchain; +}; + +static struct mp_image *d3d11_screenshot(struct ra_swapchain *sw) +{ + struct priv *p = sw->ctx->priv; + if (!p->swapchain) + return NULL; + return mp_d3d11_screenshot(p->swapchain); +} + +static struct ra_tex *get_backbuffer(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + ID3D11Texture2D *backbuffer = NULL; + struct ra_tex *tex = NULL; + HRESULT hr; + + hr = IDXGISwapChain_GetBuffer(p->swapchain, 0, &IID_ID3D11Texture2D, + (void**)&backbuffer); + if (FAILED(hr)) { + MP_ERR(ctx, "Couldn't get swapchain image\n"); + goto done; + } + + tex = ra_d3d11_wrap_tex(ctx->ra, (ID3D11Resource *)backbuffer); +done: + 
SAFE_RELEASE(backbuffer); + return tex; +} + +static bool resize(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + HRESULT hr; + + ra_tex_free(ctx->ra, &p->backbuffer); + + hr = IDXGISwapChain_ResizeBuffers(p->swapchain, 0, ctx->vo->dwidth, + ctx->vo->dheight, DXGI_FORMAT_UNKNOWN, 0); + if (FAILED(hr)) { + MP_FATAL(ctx, "Couldn't resize swapchain: %s\n", mp_HRESULT_to_str(hr)); + return false; + } + + p->backbuffer = get_backbuffer(ctx); + + return true; +} + +static bool d3d11_reconfig(struct ra_ctx *ctx) +{ + vo_w32_config(ctx->vo); + return resize(ctx); +} + +static int d3d11_color_depth(struct ra_swapchain *sw) +{ + return 8; +} + +static bool d3d11_start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo) +{ + struct priv *p = sw->priv; + *out_fbo = (struct ra_fbo) { + .tex = p->backbuffer, + .flip = false, + }; + return true; +} + +static bool d3d11_submit_frame(struct ra_swapchain *sw, + const struct vo_frame *frame) +{ + ra_d3d11_flush(sw->ctx->ra); + return true; +} + +static void d3d11_swap_buffers(struct ra_swapchain *sw) +{ + struct priv *p = sw->priv; + IDXGISwapChain_Present(p->swapchain, p->opts->sync_interval, 0); +} + +static int d3d11_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + int ret = vo_w32_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) { + if (!resize(ctx)) + return VO_ERROR; + } + return ret; +} + +static void d3d11_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + + ra_tex_free(ctx->ra, &p->backbuffer); + SAFE_RELEASE(p->swapchain); + vo_w32_uninit(ctx->vo); + SAFE_RELEASE(p->device); + + // Destory the RA last to prevent objects we hold from showing up in D3D's + // leak checker + ctx->ra->fns->destroy(ctx->ra); +} + +static const struct ra_swapchain_fns d3d11_swapchain = { + .color_depth = d3d11_color_depth, + .screenshot = d3d11_screenshot, + .start_frame = d3d11_start_frame, + .submit_frame = d3d11_submit_frame, + .swap_buffers = d3d11_swap_buffers, +}; + +static 
bool d3d11_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + p->opts = mp_get_config_group(ctx, ctx->global, &d3d11_conf); + + struct ra_swapchain *sw = ctx->swapchain = talloc_zero(ctx, struct ra_swapchain); + sw->priv = p; + sw->ctx = ctx; + sw->fns = &d3d11_swapchain; + + struct d3d11_device_opts dopts = { + .debug = ctx->opts.debug, + .allow_warp = p->opts->warp != 0, + .force_warp = p->opts->warp == 1, + .max_feature_level = p->opts->feature_level, + .max_frame_latency = ctx->opts.swapchain_depth, + }; + if (!mp_d3d11_create_present_device(ctx->log, &dopts, &p->device)) + goto error; + + if (!spirv_compiler_init(ctx)) + goto error; + ctx->ra = ra_d3d11_create(p->device, ctx->log, ctx->spirv); + if (!ctx->ra) + goto error; + + if (!vo_w32_init(ctx->vo)) + goto error; + + struct d3d11_swapchain_opts scopts = { + .window = vo_w32_hwnd(ctx->vo), + .width = ctx->vo->dwidth, + .height = ctx->vo->dheight, + .flip = p->opts->flip, + // Add one frame for the backbuffer and one frame of "slack" to reduce + // contention with the window manager when acquiring the backbuffer + .length = ctx->opts.swapchain_depth + 2, + .usage = DXGI_USAGE_RENDER_TARGET_OUTPUT, + }; + if (!mp_d3d11_create_swapchain(p->device, ctx->log, &scopts, &p->swapchain)) + goto error; + + p->backbuffer = get_backbuffer(ctx); + + return true; + +error: + d3d11_uninit(ctx); + return false; +} + +const struct ra_ctx_fns ra_ctx_d3d11 = { + .type = "d3d11", + .name = "d3d11", + .reconfig = d3d11_reconfig, + .control = d3d11_control, + .init = d3d11_init, + .uninit = d3d11_uninit, +}; diff --git a/video/out/d3d11/hwdec_d3d11va.c b/video/out/d3d11/hwdec_d3d11va.c new file mode 100644 index 0000000..d83fdc5 --- /dev/null +++ b/video/out/d3d11/hwdec_d3d11va.c @@ -0,0 +1,249 @@ +/* + * This file is part of mpv. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <windows.h> +#include <d3d11.h> +#include <d3d11_1.h> + +#include "config.h" + +#include "common/common.h" +#include "options/m_config.h" +#include "osdep/windows_utils.h" +#include "video/hwdec.h" +#include "video/d3d.h" +#include "video/out/d3d11/ra_d3d11.h" +#include "video/out/gpu/hwdec.h" + +struct d3d11va_opts { + int zero_copy; +}; + +#define OPT_BASE_STRUCT struct d3d11va_opts +const struct m_sub_options d3d11va_conf = { + .opts = (const struct m_option[]) { + OPT_FLAG("d3d11va-zero-copy", zero_copy, 0), + {0} + }, + .defaults = &(const struct d3d11va_opts) { + .zero_copy = 0, + }, + .size = sizeof(struct d3d11va_opts) +}; + +struct priv_owner { + struct d3d11va_opts *opts; + + struct mp_hwdec_ctx hwctx; + ID3D11Device *device; + ID3D11Device1 *device1; +}; + +struct priv { + // 1-copy path + ID3D11DeviceContext1 *ctx; + ID3D11Texture2D *copy_tex; + + // zero-copy path + int num_planes; + const struct ra_format *fmt[4]; +}; + +static void uninit(struct ra_hwdec *hw) +{ + struct priv_owner *p = hw->priv; + hwdec_devices_remove(hw->devs, &p->hwctx); + SAFE_RELEASE(p->device); + SAFE_RELEASE(p->device1); +} + +static int init(struct ra_hwdec *hw) +{ + struct priv_owner *p = hw->priv; + HRESULT hr; + + if (!ra_is_d3d11(hw->ra)) + return -1; + p->device = ra_d3d11_get_device(hw->ra); + if (!p->device) 
+ return -1; + + p->opts = mp_get_config_group(hw->priv, hw->global, &d3d11va_conf); + + // D3D11VA requires Direct3D 11.1, so this should always succeed + hr = ID3D11Device_QueryInterface(p->device, &IID_ID3D11Device1, + (void**)&p->device1); + if (FAILED(hr)) { + MP_ERR(hw, "Failed to get D3D11.1 interface: %s\n", + mp_HRESULT_to_str(hr)); + return -1; + } + + ID3D10Multithread *multithread; + hr = ID3D11Device_QueryInterface(p->device, &IID_ID3D10Multithread, + (void **)&multithread); + if (FAILED(hr)) { + MP_ERR(hw, "Failed to get Multithread interface: %s\n", + mp_HRESULT_to_str(hr)); + return -1; + } + ID3D10Multithread_SetMultithreadProtected(multithread, TRUE); + ID3D10Multithread_Release(multithread); + + p->hwctx = (struct mp_hwdec_ctx){ + .driver_name = hw->driver->name, + .av_device_ref = d3d11_wrap_device_ref(p->device), + }; + hwdec_devices_add(hw->devs, &p->hwctx); + return 0; +} + +static void mapper_uninit(struct ra_hwdec_mapper *mapper) +{ + struct priv *p = mapper->priv; + for (int i = 0; i < 4; i++) + ra_tex_free(mapper->ra, &mapper->tex[i]); + SAFE_RELEASE(p->copy_tex); + SAFE_RELEASE(p->ctx); +} + +static int mapper_init(struct ra_hwdec_mapper *mapper) +{ + struct priv_owner *o = mapper->owner->priv; + struct priv *p = mapper->priv; + HRESULT hr; + + mapper->dst_params = mapper->src_params; + mapper->dst_params.imgfmt = mapper->src_params.hw_subfmt; + mapper->dst_params.hw_subfmt = 0; + + struct ra_imgfmt_desc desc = {0}; + + if (!ra_get_imgfmt_desc(mapper->ra, mapper->dst_params.imgfmt, &desc)) + return -1; + + if (o->opts->zero_copy) { + // In the zero-copy path, we create the ra_tex objects in the map + // operation, so we just need to store the format of each plane + p->num_planes = desc.num_planes; + for (int i = 0; i < desc.num_planes; i++) + p->fmt[i] = desc.planes[i]; + } else { + struct mp_image layout = {0}; + mp_image_set_params(&layout, &mapper->dst_params); + + DXGI_FORMAT copy_fmt; + switch (mapper->dst_params.imgfmt) { + case 
IMGFMT_NV12: copy_fmt = DXGI_FORMAT_NV12; break; + case IMGFMT_P010: copy_fmt = DXGI_FORMAT_P010; break; + default: return -1; + } + + D3D11_TEXTURE2D_DESC copy_desc = { + .Width = mapper->dst_params.w, + .Height = mapper->dst_params.h, + .MipLevels = 1, + .ArraySize = 1, + .SampleDesc.Count = 1, + .Format = copy_fmt, + .BindFlags = D3D11_BIND_SHADER_RESOURCE, + }; + hr = ID3D11Device_CreateTexture2D(o->device, ©_desc, NULL, + &p->copy_tex); + if (FAILED(hr)) { + MP_FATAL(mapper, "Could not create shader resource texture\n"); + return -1; + } + + for (int i = 0; i < desc.num_planes; i++) { + mapper->tex[i] = ra_d3d11_wrap_tex_video(mapper->ra, p->copy_tex, + mp_image_plane_w(&layout, i), mp_image_plane_h(&layout, i), 0, + desc.planes[i]); + if (!mapper->tex[i]) { + MP_FATAL(mapper, "Could not create RA texture view\n"); + return -1; + } + } + + // A ref to the immediate context is needed for CopySubresourceRegion + ID3D11Device1_GetImmediateContext1(o->device1, &p->ctx); + } + + return 0; +} + +static int mapper_map(struct ra_hwdec_mapper *mapper) +{ + struct priv *p = mapper->priv; + ID3D11Texture2D *tex = (void *)mapper->src->planes[0]; + int subresource = (intptr_t)mapper->src->planes[1]; + + if (p->copy_tex) { + ID3D11DeviceContext1_CopySubresourceRegion1(p->ctx, + (ID3D11Resource *)p->copy_tex, 0, 0, 0, 0, + (ID3D11Resource *)tex, subresource, (&(D3D11_BOX) { + .left = 0, + .top = 0, + .front = 0, + .right = mapper->dst_params.w, + .bottom = mapper->dst_params.h, + .back = 1, + }), D3D11_COPY_DISCARD); + } else { + D3D11_TEXTURE2D_DESC desc2d; + ID3D11Texture2D_GetDesc(tex, &desc2d); + + for (int i = 0; i < p->num_planes; i++) { + // The video decode texture may include padding, so the size of the + // ra_tex needs to be determined by the actual size of the Tex2D + bool chroma = i >= 1; + int w = desc2d.Width / (chroma ? 2 : 1); + int h = desc2d.Height / (chroma ? 
2 : 1); + + mapper->tex[i] = ra_d3d11_wrap_tex_video(mapper->ra, tex, + w, h, subresource, p->fmt[i]); + if (!mapper->tex[i]) + return -1; + } + } + + return 0; +} + +static void mapper_unmap(struct ra_hwdec_mapper *mapper) +{ + struct priv *p = mapper->priv; + if (p->copy_tex) + return; + for (int i = 0; i < 4; i++) + ra_tex_free(mapper->ra, &mapper->tex[i]); +} + +const struct ra_hwdec_driver ra_hwdec_d3d11va = { + .name = "d3d11va", + .priv_size = sizeof(struct priv_owner), + .imgfmts = {IMGFMT_D3D11VA, IMGFMT_D3D11NV12, 0}, + .init = init, + .uninit = uninit, + .mapper = &(const struct ra_hwdec_mapper_driver){ + .priv_size = sizeof(struct priv), + .init = mapper_init, + .uninit = mapper_uninit, + .map = mapper_map, + .unmap = mapper_unmap, + }, +}; diff --git a/video/out/d3d11/ra_d3d11.c b/video/out/d3d11/ra_d3d11.c new file mode 100644 index 0000000..63dc5b9 --- /dev/null +++ b/video/out/d3d11/ra_d3d11.c @@ -0,0 +1,2371 @@ +#include <windows.h> +#include <versionhelpers.h> +#include <d3d11_1.h> +#include <d3d11sdklayers.h> +#include <dxgi1_2.h> +#include <d3dcompiler.h> +#include <crossc.h> + +#include "common/msg.h" +#include "osdep/io.h" +#include "osdep/subprocess.h" +#include "osdep/timer.h" +#include "osdep/windows_utils.h" +#include "video/out/gpu/spirv.h" +#include "video/out/gpu/utils.h" + +#include "ra_d3d11.h" + +#ifndef D3D11_1_UAV_SLOT_COUNT +#define D3D11_1_UAV_SLOT_COUNT (64) +#endif + +struct dll_version { + uint16_t major; + uint16_t minor; + uint16_t build; + uint16_t revision; +}; + +struct ra_d3d11 { + struct spirv_compiler *spirv; + + ID3D11Device *dev; + ID3D11Device1 *dev1; + ID3D11DeviceContext *ctx; + ID3D11DeviceContext1 *ctx1; + pD3DCompile D3DCompile; + + struct dll_version d3d_compiler_ver; + + // Debug interfaces (--gpu-debug) + ID3D11Debug *debug; + ID3D11InfoQueue *iqueue; + + // Device capabilities + D3D_FEATURE_LEVEL fl; + bool has_clear_view; + bool has_timestamp_queries; + int max_uavs; + + // Streaming dynamic vertex buffer, 
which is used for all renderpasses + ID3D11Buffer *vbuf; + size_t vbuf_size; + size_t vbuf_used; + + // clear() renderpass resources (only used when has_clear_view is false) + ID3D11PixelShader *clear_ps; + ID3D11VertexShader *clear_vs; + ID3D11InputLayout *clear_layout; + ID3D11Buffer *clear_vbuf; + ID3D11Buffer *clear_cbuf; + + // blit() renderpass resources + ID3D11PixelShader *blit_float_ps; + ID3D11VertexShader *blit_vs; + ID3D11InputLayout *blit_layout; + ID3D11Buffer *blit_vbuf; + ID3D11SamplerState *blit_sampler; +}; + +struct d3d_tex { + // res mirrors one of tex1d, tex2d or tex3d for convenience. It does not + // hold an additional reference to the texture object. + ID3D11Resource *res; + + ID3D11Texture1D *tex1d; + ID3D11Texture2D *tex2d; + ID3D11Texture3D *tex3d; + int array_slice; + + ID3D11ShaderResourceView *srv; + ID3D11RenderTargetView *rtv; + ID3D11UnorderedAccessView *uav; + ID3D11SamplerState *sampler; +}; + +struct d3d_buf { + ID3D11Buffer *buf; + ID3D11Buffer *staging; + ID3D11UnorderedAccessView *uav; + void *data; // Data for mapped staging texture +}; + +struct d3d_rpass { + ID3D11PixelShader *ps; + ID3D11VertexShader *vs; + ID3D11ComputeShader *cs; + ID3D11InputLayout *layout; + ID3D11BlendState *bstate; +}; + +struct d3d_timer { + ID3D11Query *ts_start; + ID3D11Query *ts_end; + ID3D11Query *disjoint; + uint64_t result; // Latches the result from the previous use of the timer +}; + +struct d3d_fmt { + const char *name; + int components; + int bytes; + int bits[4]; + DXGI_FORMAT fmt; + enum ra_ctype ctype; + bool unordered; +}; + +static const char clear_vs[] = "\ +float4 main(float2 pos : POSITION) : SV_Position\n\ +{\n\ + return float4(pos, 0.0, 1.0);\n\ +}\n\ +"; + +static const char clear_ps[] = "\ +cbuffer ps_cbuf : register(b0) {\n\ + float4 color : packoffset(c0);\n\ +}\n\ +\n\ +float4 main(float4 pos : SV_Position) : SV_Target\n\ +{\n\ + return color;\n\ +}\n\ +"; + +struct blit_vert { + float x, y, u, v; +}; + +static const char 
blit_vs[] = "\ +void main(float2 pos : POSITION, float2 coord : TEXCOORD0,\n\ + out float4 out_pos : SV_Position, out float2 out_coord : TEXCOORD0)\n\ +{\n\ + out_pos = float4(pos, 0.0, 1.0);\n\ + out_coord = coord;\n\ +}\n\ +"; + +static const char blit_float_ps[] = "\ +Texture2D<float4> tex : register(t0);\n\ +SamplerState samp : register(s0);\n\ +\n\ +float4 main(float4 pos : SV_Position, float2 coord : TEXCOORD0) : SV_Target\n\ +{\n\ + return tex.Sample(samp, coord);\n\ +}\n\ +"; + +#define DXFMT(f, t) .fmt = DXGI_FORMAT_##f##_##t, .ctype = RA_CTYPE_##t +static struct d3d_fmt formats[] = { + { "r8", 1, 1, { 8}, DXFMT(R8, UNORM) }, + { "rg8", 2, 2, { 8, 8}, DXFMT(R8G8, UNORM) }, + { "rgba8", 4, 4, { 8, 8, 8, 8}, DXFMT(R8G8B8A8, UNORM) }, + { "r16", 1, 2, {16}, DXFMT(R16, UNORM) }, + { "rg16", 2, 4, {16, 16}, DXFMT(R16G16, UNORM) }, + { "rgba16", 4, 8, {16, 16, 16, 16}, DXFMT(R16G16B16A16, UNORM) }, + + { "r32ui", 1, 4, {32}, DXFMT(R32, UINT) }, + { "rg32ui", 2, 8, {32, 32}, DXFMT(R32G32, UINT) }, + { "rgb32ui", 3, 12, {32, 32, 32}, DXFMT(R32G32B32, UINT) }, + { "rgba32ui", 4, 16, {32, 32, 32, 32}, DXFMT(R32G32B32A32, UINT) }, + + { "r16hf", 1, 2, {16}, DXFMT(R16, FLOAT) }, + { "rg16hf", 2, 4, {16, 16}, DXFMT(R16G16, FLOAT) }, + { "rgba16hf", 4, 8, {16, 16, 16, 16}, DXFMT(R16G16B16A16, FLOAT) }, + { "r32f", 1, 4, {32}, DXFMT(R32, FLOAT) }, + { "rg32f", 2, 8, {32, 32}, DXFMT(R32G32, FLOAT) }, + { "rgb32f", 3, 12, {32, 32, 32}, DXFMT(R32G32B32, FLOAT) }, + { "rgba32f", 4, 16, {32, 32, 32, 32}, DXFMT(R32G32B32A32, FLOAT) }, + + { "rgb10_a2", 4, 4, {10, 10, 10, 2}, DXFMT(R10G10B10A2, UNORM) }, + { "bgra8", 4, 4, { 8, 8, 8, 8}, DXFMT(B8G8R8A8, UNORM), .unordered = true }, +}; + +static bool dll_version_equal(struct dll_version a, struct dll_version b) +{ + return a.major == b.major && + a.minor == b.minor && + a.build == b.build && + a.revision == b.revision; +} + +static DXGI_FORMAT fmt_to_dxgi(const struct ra_format *fmt) +{ + struct d3d_fmt *d3d = fmt->priv; + 
return d3d->fmt; +} + +static void setup_formats(struct ra *ra) +{ + // All formats must be usable as a 2D texture + static const UINT sup_basic = D3D11_FORMAT_SUPPORT_TEXTURE2D; + // SHADER_SAMPLE indicates support for linear sampling, point always works + static const UINT sup_filter = D3D11_FORMAT_SUPPORT_SHADER_SAMPLE; + // RA requires renderable surfaces to be blendable as well + static const UINT sup_render = D3D11_FORMAT_SUPPORT_RENDER_TARGET | + D3D11_FORMAT_SUPPORT_BLENDABLE; + + struct ra_d3d11 *p = ra->priv; + HRESULT hr; + + for (int i = 0; i < MP_ARRAY_SIZE(formats); i++) { + struct d3d_fmt *d3dfmt = &formats[i]; + UINT support = 0; + hr = ID3D11Device_CheckFormatSupport(p->dev, d3dfmt->fmt, &support); + if (FAILED(hr)) + continue; + if ((support & sup_basic) != sup_basic) + continue; + + struct ra_format *fmt = talloc_zero(ra, struct ra_format); + *fmt = (struct ra_format) { + .name = d3dfmt->name, + .priv = d3dfmt, + .ctype = d3dfmt->ctype, + .ordered = !d3dfmt->unordered, + .num_components = d3dfmt->components, + .pixel_size = d3dfmt->bytes, + .linear_filter = (support & sup_filter) == sup_filter, + .renderable = (support & sup_render) == sup_render, + }; + + if (support & D3D11_FORMAT_SUPPORT_TEXTURE1D) + ra->caps |= RA_CAP_TEX_1D; + + for (int j = 0; j < d3dfmt->components; j++) + fmt->component_size[j] = fmt->component_depth[j] = d3dfmt->bits[j]; + + fmt->glsl_format = ra_fmt_glsl_format(fmt); + + MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt); + } +} + +static bool tex_init(struct ra *ra, struct ra_tex *tex) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_tex *tex_p = tex->priv; + struct ra_tex_params *params = &tex->params; + HRESULT hr; + + // A SRV is required for renderpasses and blitting, since blitting can use + // a renderpass internally + if (params->render_src || params->blit_src) { + // Always specify the SRV format for simplicity. 
This will match the + // texture format for textures created with tex_create, but it can be + // different for wrapped planar video textures. + D3D11_SHADER_RESOURCE_VIEW_DESC srvdesc = { + .Format = fmt_to_dxgi(params->format), + }; + switch (params->dimensions) { + case 1: + if (tex_p->array_slice >= 0) { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1DARRAY; + srvdesc.Texture1DArray.MipLevels = 1; + srvdesc.Texture1DArray.FirstArraySlice = tex_p->array_slice; + srvdesc.Texture1DArray.ArraySize = 1; + } else { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE1D; + srvdesc.Texture1D.MipLevels = 1; + } + break; + case 2: + if (tex_p->array_slice >= 0) { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2DARRAY; + srvdesc.Texture2DArray.MipLevels = 1; + srvdesc.Texture2DArray.FirstArraySlice = tex_p->array_slice; + srvdesc.Texture2DArray.ArraySize = 1; + } else { + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE2D; + srvdesc.Texture2D.MipLevels = 1; + } + break; + case 3: + // D3D11 does not have Texture3D arrays + srvdesc.ViewDimension = D3D11_SRV_DIMENSION_TEXTURE3D; + srvdesc.Texture3D.MipLevels = 1; + break; + } + hr = ID3D11Device_CreateShaderResourceView(p->dev, tex_p->res, &srvdesc, + &tex_p->srv); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create SRV: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + } + + // Samplers are required for renderpasses, but not blitting, since the blit + // code uses its own point sampler + if (params->render_src) { + D3D11_SAMPLER_DESC sdesc = { + .AddressU = D3D11_TEXTURE_ADDRESS_CLAMP, + .AddressV = D3D11_TEXTURE_ADDRESS_CLAMP, + .AddressW = D3D11_TEXTURE_ADDRESS_CLAMP, + .ComparisonFunc = D3D11_COMPARISON_NEVER, + .MinLOD = 0, + .MaxLOD = D3D11_FLOAT32_MAX, + .MaxAnisotropy = 1, + }; + if (params->src_linear) + sdesc.Filter = D3D11_FILTER_MIN_MAG_MIP_LINEAR; + if (params->src_repeat) { + sdesc.AddressU = sdesc.AddressV = sdesc.AddressW = + D3D11_TEXTURE_ADDRESS_WRAP; + } + // The runtime pools sampler 
state objects internally, so we don't have + // to worry about resource usage when creating one for every ra_tex + hr = ID3D11Device_CreateSamplerState(p->dev, &sdesc, &tex_p->sampler); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create sampler: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + } + + // Like SRVs, an RTV is required for renderpass output and blitting + if (params->render_dst || params->blit_dst) { + hr = ID3D11Device_CreateRenderTargetView(p->dev, tex_p->res, NULL, + &tex_p->rtv); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create RTV: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0 && params->storage_dst) { + hr = ID3D11Device_CreateUnorderedAccessView(p->dev, tex_p->res, NULL, + &tex_p->uav); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create UAV: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + } + + return true; +error: + return false; +} + +static void tex_destroy(struct ra *ra, struct ra_tex *tex) +{ + if (!tex) + return; + struct d3d_tex *tex_p = tex->priv; + + SAFE_RELEASE(tex_p->srv); + SAFE_RELEASE(tex_p->rtv); + SAFE_RELEASE(tex_p->uav); + SAFE_RELEASE(tex_p->sampler); + SAFE_RELEASE(tex_p->res); + talloc_free(tex); +} + +static struct ra_tex *tex_create(struct ra *ra, + const struct ra_tex_params *params) +{ + struct ra_d3d11 *p = ra->priv; + HRESULT hr; + + struct ra_tex *tex = talloc_zero(NULL, struct ra_tex); + tex->params = *params; + tex->params.initial_data = NULL; + + struct d3d_tex *tex_p = tex->priv = talloc_zero(tex, struct d3d_tex); + DXGI_FORMAT fmt = fmt_to_dxgi(params->format); + + D3D11_SUBRESOURCE_DATA *pdata = NULL; + if (params->initial_data) { + pdata = &(D3D11_SUBRESOURCE_DATA) { + .pSysMem = params->initial_data, + .SysMemPitch = params->w * params->format->pixel_size, + }; + if (params->dimensions >= 3) + pdata->SysMemSlicePitch = pdata->SysMemPitch * params->h; + } + + D3D11_USAGE usage = D3D11_USAGE_DEFAULT; + D3D11_BIND_FLAG bind_flags = 0; + + if 
(params->render_src || params->blit_src) + bind_flags |= D3D11_BIND_SHADER_RESOURCE; + if (params->render_dst || params->blit_dst) + bind_flags |= D3D11_BIND_RENDER_TARGET; + if (p->fl >= D3D_FEATURE_LEVEL_11_0 && params->storage_dst) + bind_flags |= D3D11_BIND_UNORDERED_ACCESS; + + // Apparently IMMUTABLE textures are efficient, so try to infer whether we + // can use one + if (params->initial_data && !params->render_dst && !params->storage_dst && + !params->blit_dst && !params->host_mutable) + usage = D3D11_USAGE_IMMUTABLE; + + switch (params->dimensions) { + case 1:; + D3D11_TEXTURE1D_DESC desc1d = { + .Width = params->w, + .MipLevels = 1, + .ArraySize = 1, + .Format = fmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + hr = ID3D11Device_CreateTexture1D(p->dev, &desc1d, pdata, &tex_p->tex1d); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create Texture1D: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + tex_p->res = (ID3D11Resource *)tex_p->tex1d; + break; + case 2:; + D3D11_TEXTURE2D_DESC desc2d = { + .Width = params->w, + .Height = params->h, + .MipLevels = 1, + .ArraySize = 1, + .SampleDesc.Count = 1, + .Format = fmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + hr = ID3D11Device_CreateTexture2D(p->dev, &desc2d, pdata, &tex_p->tex2d); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create Texture2D: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + tex_p->res = (ID3D11Resource *)tex_p->tex2d; + break; + case 3:; + D3D11_TEXTURE3D_DESC desc3d = { + .Width = params->w, + .Height = params->h, + .Depth = params->d, + .MipLevels = 1, + .Format = fmt, + .Usage = usage, + .BindFlags = bind_flags, + }; + hr = ID3D11Device_CreateTexture3D(p->dev, &desc3d, pdata, &tex_p->tex3d); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create Texture3D: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + tex_p->res = (ID3D11Resource *)tex_p->tex3d; + break; + default: + abort(); + } + + tex_p->array_slice = -1; + + if (!tex_init(ra, tex)) + goto error; + + return tex; 
+ +error: + tex_destroy(ra, tex); + return NULL; +} + +struct ra_tex *ra_d3d11_wrap_tex(struct ra *ra, ID3D11Resource *res) +{ + HRESULT hr; + + struct ra_tex *tex = talloc_zero(NULL, struct ra_tex); + struct ra_tex_params *params = &tex->params; + struct d3d_tex *tex_p = tex->priv = talloc_zero(tex, struct d3d_tex); + + DXGI_FORMAT fmt = DXGI_FORMAT_UNKNOWN; + D3D11_USAGE usage = D3D11_USAGE_DEFAULT; + D3D11_BIND_FLAG bind_flags = 0; + + D3D11_RESOURCE_DIMENSION type; + ID3D11Resource_GetType(res, &type); + switch (type) { + case D3D11_RESOURCE_DIMENSION_TEXTURE2D: + hr = ID3D11Resource_QueryInterface(res, &IID_ID3D11Texture2D, + (void**)&tex_p->tex2d); + if (FAILED(hr)) { + MP_ERR(ra, "Resource is not a ID3D11Texture2D\n"); + goto error; + } + tex_p->res = (ID3D11Resource *)tex_p->tex2d; + + D3D11_TEXTURE2D_DESC desc2d; + ID3D11Texture2D_GetDesc(tex_p->tex2d, &desc2d); + if (desc2d.MipLevels != 1) { + MP_ERR(ra, "Mipmapped textures not supported for wrapping\n"); + goto error; + } + if (desc2d.ArraySize != 1) { + MP_ERR(ra, "Texture arrays not supported for wrapping\n"); + goto error; + } + if (desc2d.SampleDesc.Count != 1) { + MP_ERR(ra, "Multisampled textures not supported for wrapping\n"); + goto error; + } + + params->dimensions = 2; + params->w = desc2d.Width; + params->h = desc2d.Height; + params->d = 1; + usage = desc2d.Usage; + bind_flags = desc2d.BindFlags; + fmt = desc2d.Format; + break; + default: + // We could wrap Texture1D/3D as well, but keep it simple, since this + // function is only used for swapchain backbuffers at the moment + MP_ERR(ra, "Resource is not suitable to wrap\n"); + goto error; + } + + for (int i = 0; i < ra->num_formats; i++) { + DXGI_FORMAT target_fmt = fmt_to_dxgi(ra->formats[i]); + if (fmt == target_fmt) { + params->format = ra->formats[i]; + break; + } + } + if (!params->format) { + MP_ERR(ra, "Could not find a suitable RA format for wrapped resource\n"); + goto error; + } + + if (bind_flags & D3D11_BIND_SHADER_RESOURCE) + 
params->render_src = params->blit_src = true; + if (bind_flags & D3D11_BIND_RENDER_TARGET) + params->render_dst = params->blit_dst = true; + if (bind_flags & D3D11_BIND_UNORDERED_ACCESS) + params->storage_dst = true; + + if (usage != D3D11_USAGE_DEFAULT) { + MP_ERR(ra, "Resource is not D3D11_USAGE_DEFAULT\n"); + goto error; + } + + tex_p->array_slice = -1; + + if (!tex_init(ra, tex)) + goto error; + + return tex; +error: + tex_destroy(ra, tex); + return NULL; +} + +struct ra_tex *ra_d3d11_wrap_tex_video(struct ra *ra, ID3D11Texture2D *res, + int w, int h, int array_slice, + const struct ra_format *fmt) +{ + struct ra_tex *tex = talloc_zero(NULL, struct ra_tex); + struct ra_tex_params *params = &tex->params; + struct d3d_tex *tex_p = tex->priv = talloc_zero(tex, struct d3d_tex); + + tex_p->tex2d = res; + tex_p->res = (ID3D11Resource *)tex_p->tex2d; + ID3D11Texture2D_AddRef(res); + + D3D11_TEXTURE2D_DESC desc2d; + ID3D11Texture2D_GetDesc(tex_p->tex2d, &desc2d); + if (!(desc2d.BindFlags & D3D11_BIND_SHADER_RESOURCE)) { + MP_ERR(ra, "Video resource is not bindable\n"); + goto error; + } + + params->dimensions = 2; + params->w = w; + params->h = h; + params->d = 1; + params->render_src = true; + params->src_linear = true; + // fmt can be different to the texture format for planar video textures + params->format = fmt; + + if (desc2d.ArraySize > 1) { + tex_p->array_slice = array_slice; + } else { + tex_p->array_slice = -1; + } + + if (!tex_init(ra, tex)) + goto error; + + return tex; +error: + tex_destroy(ra, tex); + return NULL; +} + +static bool tex_upload(struct ra *ra, const struct ra_tex_upload_params *params) +{ + struct ra_d3d11 *p = ra->priv; + struct ra_tex *tex = params->tex; + struct d3d_tex *tex_p = tex->priv; + + if (!params->src) { + MP_ERR(ra, "Pixel buffers are not supported\n"); + return false; + } + + const char *src = params->src; + ptrdiff_t stride = tex->params.dimensions >= 2 ? tex->params.w : 0; + ptrdiff_t pitch = tex->params.dimensions >= 3 ? 
stride * tex->params.h : 0; + bool invalidate = true; + D3D11_BOX *rc = NULL; + + if (tex->params.dimensions == 2) { + stride = params->stride; + + if (params->rc && (params->rc->x0 != 0 || params->rc->y0 != 0 || + params->rc->x1 != tex->params.w || params->rc->y1 != tex->params.h)) + { + rc = &(D3D11_BOX) { + .left = params->rc->x0, + .top = params->rc->y0, + .front = 0, + .right = params->rc->x1, + .bottom = params->rc->y1, + .back = 1, + }; + invalidate = params->invalidate; + } + } + + int subresource = tex_p->array_slice >= 0 ? tex_p->array_slice : 0; + if (p->ctx1) { + ID3D11DeviceContext1_UpdateSubresource1(p->ctx1, tex_p->res, + subresource, rc, src, stride, pitch, + invalidate ? D3D11_COPY_DISCARD : 0); + } else { + ID3D11DeviceContext_UpdateSubresource(p->ctx, tex_p->res, subresource, + rc, src, stride, pitch); + } + + return true; +} + +static void buf_destroy(struct ra *ra, struct ra_buf *buf) +{ + if (!buf) + return; + struct ra_d3d11 *p = ra->priv; + struct d3d_buf *buf_p = buf->priv; + + if (buf_p->data) + ID3D11DeviceContext_Unmap(p->ctx, (ID3D11Resource *)buf_p->staging, 0); + SAFE_RELEASE(buf_p->buf); + SAFE_RELEASE(buf_p->staging); + SAFE_RELEASE(buf_p->uav); + talloc_free(buf); +} + +static struct ra_buf *buf_create(struct ra *ra, + const struct ra_buf_params *params) +{ + // D3D11 does not support permanent mapping or pixel buffers + if (params->host_mapped || params->type == RA_BUF_TYPE_TEX_UPLOAD) + return NULL; + + struct ra_d3d11 *p = ra->priv; + HRESULT hr; + + struct ra_buf *buf = talloc_zero(NULL, struct ra_buf); + buf->params = *params; + buf->params.initial_data = NULL; + + struct d3d_buf *buf_p = buf->priv = talloc_zero(buf, struct d3d_buf); + + D3D11_SUBRESOURCE_DATA *pdata = NULL; + if (params->initial_data) + pdata = &(D3D11_SUBRESOURCE_DATA) { .pSysMem = params->initial_data }; + + D3D11_BUFFER_DESC desc = { .ByteWidth = params->size }; + switch (params->type) { + case RA_BUF_TYPE_SHADER_STORAGE: + desc.BindFlags = 
D3D11_BIND_UNORDERED_ACCESS; + desc.ByteWidth = MP_ALIGN_UP(desc.ByteWidth, sizeof(float)); + desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS; + break; + case RA_BUF_TYPE_UNIFORM: + desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER; + desc.ByteWidth = MP_ALIGN_UP(desc.ByteWidth, sizeof(float[4])); + break; + } + + hr = ID3D11Device_CreateBuffer(p->dev, &desc, pdata, &buf_p->buf); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create buffer: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + + if (params->host_mutable) { + // D3D11 doesn't allow constant buffer updates that aren't aligned to a + // full constant boundary (vec4,) and some drivers don't allow partial + // constant buffer updates at all, but the RA consumer is allowed to + // partially update an ra_buf. The best way to handle partial updates + // without causing a pipeline stall is probably to keep a copy of the + // data in a staging buffer. + + desc.Usage = D3D11_USAGE_STAGING; + desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE; + desc.BindFlags = 0; + hr = ID3D11Device_CreateBuffer(p->dev, &desc, NULL, &buf_p->staging); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create staging buffer: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + } + + if (params->type == RA_BUF_TYPE_SHADER_STORAGE) { + D3D11_UNORDERED_ACCESS_VIEW_DESC udesc = { + .Format = DXGI_FORMAT_R32_TYPELESS, + .ViewDimension = D3D11_UAV_DIMENSION_BUFFER, + .Buffer = { + .NumElements = desc.ByteWidth / sizeof(float), + .Flags = D3D11_BUFFER_UAV_FLAG_RAW, + }, + }; + hr = ID3D11Device_CreateUnorderedAccessView(p->dev, + (ID3D11Resource *)buf_p->buf, &udesc, &buf_p->uav); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create UAV: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + } + + return buf; +error: + buf_destroy(ra, buf); + return NULL; +} + +static void buf_resolve(struct ra *ra, struct ra_buf *buf) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_buf *buf_p = buf->priv; + + assert(buf->params.host_mutable); + if (!buf_p->data) + 
return; + + ID3D11DeviceContext_Unmap(p->ctx, (ID3D11Resource *)buf_p->staging, 0); + buf_p->data = NULL; + + // Synchronize the GPU buffer with the staging buffer + ID3D11DeviceContext_CopyResource(p->ctx, (ID3D11Resource *)buf_p->buf, + (ID3D11Resource *)buf_p->staging); +} + +static void buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, + const void *data, size_t size) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_buf *buf_p = buf->priv; + HRESULT hr; + + if (!buf_p->data) { + // If this is the first update after the buffer was created or after it + // has been used in a renderpass, it will be unmapped, so map it + D3D11_MAPPED_SUBRESOURCE map = {0}; + hr = ID3D11DeviceContext_Map(p->ctx, (ID3D11Resource *)buf_p->staging, + 0, D3D11_MAP_WRITE, 0, &map); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to map resource\n"); + return; + } + buf_p->data = map.pData; + } + + char *cdata = buf_p->data; + memcpy(cdata + offset, data, size); +} + +static const char *get_shader_target(struct ra *ra, enum glsl_shader type) +{ + struct ra_d3d11 *p = ra->priv; + switch (p->fl) { + default: + switch (type) { + case GLSL_SHADER_VERTEX: return "vs_5_0"; + case GLSL_SHADER_FRAGMENT: return "ps_5_0"; + case GLSL_SHADER_COMPUTE: return "cs_5_0"; + } + break; + case D3D_FEATURE_LEVEL_10_1: + switch (type) { + case GLSL_SHADER_VERTEX: return "vs_4_1"; + case GLSL_SHADER_FRAGMENT: return "ps_4_1"; + case GLSL_SHADER_COMPUTE: return "cs_4_1"; + } + break; + case D3D_FEATURE_LEVEL_10_0: + switch (type) { + case GLSL_SHADER_VERTEX: return "vs_4_0"; + case GLSL_SHADER_FRAGMENT: return "ps_4_0"; + case GLSL_SHADER_COMPUTE: return "cs_4_0"; + } + break; + case D3D_FEATURE_LEVEL_9_3: + switch (type) { + case GLSL_SHADER_VERTEX: return "vs_4_0_level_9_3"; + case GLSL_SHADER_FRAGMENT: return "ps_4_0_level_9_3"; + } + break; + case D3D_FEATURE_LEVEL_9_2: + case D3D_FEATURE_LEVEL_9_1: + switch (type) { + case GLSL_SHADER_VERTEX: return "vs_4_0_level_9_1"; + case GLSL_SHADER_FRAGMENT: 
return "ps_4_0_level_9_1"; + } + break; + } + return NULL; +} + +static const char *shader_type_name(enum glsl_shader type) +{ + switch (type) { + case GLSL_SHADER_VERTEX: return "vertex"; + case GLSL_SHADER_FRAGMENT: return "fragment"; + case GLSL_SHADER_COMPUTE: return "compute"; + default: return "unknown"; + } +} + +static bool setup_clear_rpass(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + ID3DBlob *vs_blob = NULL; + ID3DBlob *ps_blob = NULL; + HRESULT hr; + + hr = p->D3DCompile(clear_vs, sizeof(clear_vs), NULL, NULL, NULL, "main", + get_shader_target(ra, GLSL_SHADER_VERTEX), + D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, &vs_blob, NULL); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to compile clear() vertex shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + hr = ID3D11Device_CreateVertexShader(p->dev, + ID3D10Blob_GetBufferPointer(vs_blob), ID3D10Blob_GetBufferSize(vs_blob), + NULL, &p->clear_vs); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create clear() vertex shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + hr = p->D3DCompile(clear_ps, sizeof(clear_ps), NULL, NULL, NULL, "main", + get_shader_target(ra, GLSL_SHADER_FRAGMENT), + D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, &ps_blob, NULL); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to compile clear() pixel shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + hr = ID3D11Device_CreatePixelShader(p->dev, + ID3D10Blob_GetBufferPointer(ps_blob), ID3D10Blob_GetBufferSize(ps_blob), + NULL, &p->clear_ps); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create clear() pixel shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + D3D11_INPUT_ELEMENT_DESC in_descs[] = { + { "POSITION", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0 }, + }; + hr = ID3D11Device_CreateInputLayout(p->dev, in_descs, + MP_ARRAY_SIZE(in_descs), ID3D10Blob_GetBufferPointer(vs_blob), + ID3D10Blob_GetBufferSize(vs_blob), &p->clear_layout); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create clear() IA layout: %s\n", + 
mp_HRESULT_to_str(hr)); + goto error; + } + + // clear() always draws to a quad covering the whole viewport + static const float verts[] = { + -1, -1, + 1, -1, + 1, 1, + -1, 1, + -1, -1, + 1, 1, + }; + D3D11_BUFFER_DESC vdesc = { + .ByteWidth = sizeof(verts), + .Usage = D3D11_USAGE_IMMUTABLE, + .BindFlags = D3D11_BIND_VERTEX_BUFFER, + }; + D3D11_SUBRESOURCE_DATA vdata = { + .pSysMem = verts, + }; + hr = ID3D11Device_CreateBuffer(p->dev, &vdesc, &vdata, &p->clear_vbuf); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create clear() vertex buffer: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + D3D11_BUFFER_DESC cdesc = { + .ByteWidth = sizeof(float[4]), + .BindFlags = D3D11_BIND_CONSTANT_BUFFER, + }; + hr = ID3D11Device_CreateBuffer(p->dev, &cdesc, NULL, &p->clear_cbuf); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create clear() constant buffer: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + SAFE_RELEASE(vs_blob); + SAFE_RELEASE(ps_blob); + return true; +error: + SAFE_RELEASE(vs_blob); + SAFE_RELEASE(ps_blob); + return false; +} + +static void clear_rpass(struct ra *ra, struct ra_tex *tex, float color[4], + struct mp_rect *rc) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_tex *tex_p = tex->priv; + struct ra_tex_params *params = &tex->params; + + ID3D11DeviceContext_UpdateSubresource(p->ctx, + (ID3D11Resource *)p->clear_cbuf, 0, NULL, color, 0, 0); + + ID3D11DeviceContext_IASetInputLayout(p->ctx, p->clear_layout); + ID3D11DeviceContext_IASetVertexBuffers(p->ctx, 0, 1, &p->clear_vbuf, + &(UINT) { sizeof(float[2]) }, &(UINT) { 0 }); + ID3D11DeviceContext_IASetPrimitiveTopology(p->ctx, + D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + + ID3D11DeviceContext_VSSetShader(p->ctx, p->clear_vs, NULL, 0); + + ID3D11DeviceContext_RSSetViewports(p->ctx, 1, (&(D3D11_VIEWPORT) { + .Width = params->w, + .Height = params->h, + .MinDepth = 0, + .MaxDepth = 1, + })); + ID3D11DeviceContext_RSSetScissorRects(p->ctx, 1, (&(D3D11_RECT) { + .left = rc->x0, + .top = rc->y0, + .right 
= rc->x1, + .bottom = rc->y1, + })); + ID3D11DeviceContext_PSSetShader(p->ctx, p->clear_ps, NULL, 0); + ID3D11DeviceContext_PSSetConstantBuffers(p->ctx, 0, 1, &p->clear_cbuf); + + ID3D11DeviceContext_OMSetRenderTargets(p->ctx, 1, &tex_p->rtv, NULL); + ID3D11DeviceContext_OMSetBlendState(p->ctx, NULL, NULL, + D3D11_DEFAULT_SAMPLE_MASK); + + ID3D11DeviceContext_Draw(p->ctx, 6, 0); + + ID3D11DeviceContext_PSSetConstantBuffers(p->ctx, 0, 1, + &(ID3D11Buffer *){ NULL }); + ID3D11DeviceContext_OMSetRenderTargets(p->ctx, 0, NULL, NULL); +} + +static void clear(struct ra *ra, struct ra_tex *tex, float color[4], + struct mp_rect *rc) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_tex *tex_p = tex->priv; + struct ra_tex_params *params = &tex->params; + + if (!tex_p->rtv) + return; + + if (rc->x0 || rc->y0 || rc->x1 != params->w || rc->y1 != params->h) { + if (p->has_clear_view) { + ID3D11DeviceContext1_ClearView(p->ctx1, (ID3D11View *)tex_p->rtv, + color, (&(D3D11_RECT) { + .left = rc->x0, + .top = rc->y0, + .right = rc->x1, + .bottom = rc->y1, + }), 1); + } else { + clear_rpass(ra, tex, color, rc); + } + } else { + ID3D11DeviceContext_ClearRenderTargetView(p->ctx, tex_p->rtv, color); + } +} + +static bool setup_blit_rpass(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + ID3DBlob *vs_blob = NULL; + ID3DBlob *float_ps_blob = NULL; + HRESULT hr; + + hr = p->D3DCompile(blit_vs, sizeof(blit_vs), NULL, NULL, NULL, "main", + get_shader_target(ra, GLSL_SHADER_VERTEX), + D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, &vs_blob, NULL); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to compile blit() vertex shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + hr = ID3D11Device_CreateVertexShader(p->dev, + ID3D10Blob_GetBufferPointer(vs_blob), ID3D10Blob_GetBufferSize(vs_blob), + NULL, &p->blit_vs); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create blit() vertex shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + hr = p->D3DCompile(blit_float_ps, sizeof(blit_float_ps), NULL, 
NULL, NULL, + "main", get_shader_target(ra, GLSL_SHADER_FRAGMENT), + D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, &float_ps_blob, NULL); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to compile blit() pixel shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + hr = ID3D11Device_CreatePixelShader(p->dev, + ID3D10Blob_GetBufferPointer(float_ps_blob), + ID3D10Blob_GetBufferSize(float_ps_blob), + NULL, &p->blit_float_ps); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create blit() pixel shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + D3D11_INPUT_ELEMENT_DESC in_descs[] = { + { "POSITION", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 0 }, + { "TEXCOORD", 0, DXGI_FORMAT_R32G32_FLOAT, 0, 8 }, + }; + hr = ID3D11Device_CreateInputLayout(p->dev, in_descs, + MP_ARRAY_SIZE(in_descs), ID3D10Blob_GetBufferPointer(vs_blob), + ID3D10Blob_GetBufferSize(vs_blob), &p->blit_layout); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create blit() IA layout: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + D3D11_BUFFER_DESC vdesc = { + .ByteWidth = sizeof(struct blit_vert[6]), + .Usage = D3D11_USAGE_DEFAULT, + .BindFlags = D3D11_BIND_VERTEX_BUFFER, + }; + hr = ID3D11Device_CreateBuffer(p->dev, &vdesc, NULL, &p->blit_vbuf); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create blit() vertex buffer: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + // Blit always uses point sampling, regardless of the source texture + D3D11_SAMPLER_DESC sdesc = { + .AddressU = D3D11_TEXTURE_ADDRESS_CLAMP, + .AddressV = D3D11_TEXTURE_ADDRESS_CLAMP, + .AddressW = D3D11_TEXTURE_ADDRESS_CLAMP, + .ComparisonFunc = D3D11_COMPARISON_NEVER, + .MinLOD = 0, + .MaxLOD = D3D11_FLOAT32_MAX, + .MaxAnisotropy = 1, + }; + hr = ID3D11Device_CreateSamplerState(p->dev, &sdesc, &p->blit_sampler); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create blit() sampler: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + SAFE_RELEASE(vs_blob); + SAFE_RELEASE(float_ps_blob); + return true; +error: + SAFE_RELEASE(vs_blob); + 
SAFE_RELEASE(float_ps_blob); + return false; +} + +static void blit_rpass(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc, struct mp_rect *src_rc) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_tex *dst_p = dst->priv; + struct d3d_tex *src_p = src->priv; + + float u_min = (double)src_rc->x0 / src->params.w; + float u_max = (double)src_rc->x1 / src->params.w; + float v_min = (double)src_rc->y0 / src->params.h; + float v_max = (double)src_rc->y1 / src->params.h; + + struct blit_vert verts[6] = { + { .x = -1, .y = -1, .u = u_min, .v = v_max }, + { .x = 1, .y = -1, .u = u_max, .v = v_max }, + { .x = 1, .y = 1, .u = u_max, .v = v_min }, + { .x = -1, .y = 1, .u = u_min, .v = v_min }, + }; + verts[4] = verts[0]; + verts[5] = verts[2]; + ID3D11DeviceContext_UpdateSubresource(p->ctx, + (ID3D11Resource *)p->blit_vbuf, 0, NULL, verts, 0, 0); + + ID3D11DeviceContext_IASetInputLayout(p->ctx, p->blit_layout); + ID3D11DeviceContext_IASetVertexBuffers(p->ctx, 0, 1, &p->blit_vbuf, + &(UINT) { sizeof(verts[0]) }, &(UINT) { 0 }); + ID3D11DeviceContext_IASetPrimitiveTopology(p->ctx, + D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + + ID3D11DeviceContext_VSSetShader(p->ctx, p->blit_vs, NULL, 0); + + ID3D11DeviceContext_RSSetViewports(p->ctx, 1, (&(D3D11_VIEWPORT) { + .TopLeftX = dst_rc->x0, + .TopLeftY = dst_rc->y0, + .Width = mp_rect_w(*dst_rc), + .Height = mp_rect_h(*dst_rc), + .MinDepth = 0, + .MaxDepth = 1, + })); + ID3D11DeviceContext_RSSetScissorRects(p->ctx, 1, (&(D3D11_RECT) { + .left = dst_rc->x0, + .top = dst_rc->y0, + .right = dst_rc->x1, + .bottom = dst_rc->y1, + })); + + ID3D11DeviceContext_PSSetShader(p->ctx, p->blit_float_ps, NULL, 0); + ID3D11DeviceContext_PSSetShaderResources(p->ctx, 0, 1, &src_p->srv); + ID3D11DeviceContext_PSSetSamplers(p->ctx, 0, 1, &p->blit_sampler); + + ID3D11DeviceContext_OMSetRenderTargets(p->ctx, 1, &dst_p->rtv, NULL); + ID3D11DeviceContext_OMSetBlendState(p->ctx, NULL, NULL, + D3D11_DEFAULT_SAMPLE_MASK); + + 
ID3D11DeviceContext_Draw(p->ctx, 6, 0); + + ID3D11DeviceContext_PSSetShaderResources(p->ctx, 0, 1, + &(ID3D11ShaderResourceView *) { NULL }); + ID3D11DeviceContext_PSSetSamplers(p->ctx, 0, 1, + &(ID3D11SamplerState *) { NULL }); + ID3D11DeviceContext_OMSetRenderTargets(p->ctx, 0, NULL, NULL); +} + +static void blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc_ptr, struct mp_rect *src_rc_ptr) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_tex *dst_p = dst->priv; + struct d3d_tex *src_p = src->priv; + struct mp_rect dst_rc = *dst_rc_ptr; + struct mp_rect src_rc = *src_rc_ptr; + + assert(dst->params.dimensions == 2); + assert(src->params.dimensions == 2); + + // A zero-sized target rectangle is a no-op + if (!mp_rect_w(dst_rc) || !mp_rect_h(dst_rc)) + return; + + // ra.h seems to imply that both dst_rc and src_rc can be flipped, but it's + // easier for blit_rpass() if only src_rc can be flipped, so unflip dst_rc. + if (dst_rc.x0 > dst_rc.x1) { + MPSWAP(int, dst_rc.x0, dst_rc.x1); + MPSWAP(int, src_rc.x0, src_rc.x1); + } + if (dst_rc.y0 > dst_rc.y1) { + MPSWAP(int, dst_rc.y0, dst_rc.y1); + MPSWAP(int, src_rc.y0, src_rc.y1); + } + + // If format conversion, stretching or flipping is required, a renderpass + // must be used + if (dst->params.format != src->params.format || + mp_rect_w(dst_rc) != mp_rect_w(src_rc) || + mp_rect_h(dst_rc) != mp_rect_h(src_rc)) + { + blit_rpass(ra, dst, src, &dst_rc, &src_rc); + } else { + int dst_sr = dst_p->array_slice >= 0 ? dst_p->array_slice : 0; + int src_sr = src_p->array_slice >= 0 ? 
src_p->array_slice : 0; + ID3D11DeviceContext_CopySubresourceRegion(p->ctx, dst_p->res, dst_sr, + dst_rc.x0, dst_rc.y0, 0, src_p->res, src_sr, (&(D3D11_BOX) { + .left = src_rc.x0, + .top = src_rc.y0, + .front = 0, + .right = src_rc.x1, + .bottom = src_rc.y1, + .back = 1, + })); + } +} + +static int desc_namespace(enum ra_vartype type) +{ + // Images and SSBOs both use UAV bindings + if (type == RA_VARTYPE_IMG_W) + type = RA_VARTYPE_BUF_RW; + return type; +} + +static bool compile_glsl(struct ra *ra, enum glsl_shader type, + const char *glsl, ID3DBlob **out) +{ + struct ra_d3d11 *p = ra->priv; + struct spirv_compiler *spirv = p->spirv; + void *ta_ctx = talloc_new(NULL); + crossc_compiler *cross = NULL; + const char *hlsl = NULL; + ID3DBlob *errors = NULL; + bool success = false; + HRESULT hr; + + int cross_shader_model; + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + cross_shader_model = 50; + } else if (p->fl >= D3D_FEATURE_LEVEL_10_1) { + cross_shader_model = 41; + } else { + cross_shader_model = 40; + } + + int64_t start_us = mp_time_us(); + + bstr spv_module; + if (!spirv->fns->compile_glsl(spirv, ta_ctx, type, glsl, &spv_module)) + goto done; + + int64_t shaderc_us = mp_time_us(); + + cross = crossc_hlsl_create((uint32_t*)spv_module.start, + spv_module.len / sizeof(uint32_t)); + + crossc_hlsl_set_shader_model(cross, cross_shader_model); + crossc_set_flip_vert_y(cross, type == GLSL_SHADER_VERTEX); + + hlsl = crossc_compile(cross); + if (!hlsl) { + MP_ERR(ra, "SPIRV-Cross failed: %s\n", crossc_strerror(cross)); + goto done; + } + + int64_t cross_us = mp_time_us(); + + hr = p->D3DCompile(hlsl, strlen(hlsl), NULL, NULL, NULL, "main", + get_shader_target(ra, type), D3DCOMPILE_OPTIMIZATION_LEVEL3, 0, out, + &errors); + if (FAILED(hr)) { + MP_ERR(ra, "D3DCompile failed: %s\n%.*s", mp_HRESULT_to_str(hr), + (int)ID3D10Blob_GetBufferSize(errors), + (char*)ID3D10Blob_GetBufferPointer(errors)); + goto done; + } + + int64_t d3dcompile_us = mp_time_us(); + + MP_VERBOSE(ra, 
"Compiled a %s shader in %lldus\n", shader_type_name(type), + d3dcompile_us - start_us); + MP_VERBOSE(ra, "shaderc: %lldus, SPIRV-Cross: %lldus, D3DCompile: %lldus\n", + shaderc_us - start_us, + cross_us - shaderc_us, + d3dcompile_us - cross_us); + + success = true; +done:; + int level = success ? MSGL_DEBUG : MSGL_ERR; + MP_MSG(ra, level, "GLSL source:\n"); + mp_log_source(ra->log, level, glsl); + if (hlsl) { + MP_MSG(ra, level, "HLSL source:\n"); + mp_log_source(ra->log, level, hlsl); + } + SAFE_RELEASE(errors); + crossc_destroy(cross); + talloc_free(ta_ctx); + return success; +} + +static void renderpass_destroy(struct ra *ra, struct ra_renderpass *pass) +{ + if (!pass) + return; + struct d3d_rpass *pass_p = pass->priv; + + SAFE_RELEASE(pass_p->vs); + SAFE_RELEASE(pass_p->ps); + SAFE_RELEASE(pass_p->cs); + SAFE_RELEASE(pass_p->layout); + SAFE_RELEASE(pass_p->bstate); + talloc_free(pass); +} + +static D3D11_BLEND map_ra_blend(enum ra_blend blend) +{ + switch (blend) { + default: + case RA_BLEND_ZERO: return D3D11_BLEND_ZERO; + case RA_BLEND_ONE: return D3D11_BLEND_ONE; + case RA_BLEND_SRC_ALPHA: return D3D11_BLEND_SRC_ALPHA; + case RA_BLEND_ONE_MINUS_SRC_ALPHA: return D3D11_BLEND_INV_SRC_ALPHA; + }; +} + +static size_t vbuf_upload(struct ra *ra, void *data, size_t size) +{ + struct ra_d3d11 *p = ra->priv; + HRESULT hr; + + // Arbitrary size limit in case there is an insane number of vertices + if (size > 1e9) { + MP_ERR(ra, "Vertex buffer is too large\n"); + return -1; + } + + // If the vertex data doesn't fit, realloc the vertex buffer + if (size > p->vbuf_size) { + size_t new_size = p->vbuf_size; + // Arbitrary base size + if (!new_size) + new_size = 64 * 1024; + while (new_size < size) + new_size *= 2; + + ID3D11Buffer *new_buf; + D3D11_BUFFER_DESC vbuf_desc = { + .ByteWidth = new_size, + .Usage = D3D11_USAGE_DYNAMIC, + .BindFlags = D3D11_BIND_VERTEX_BUFFER, + .CPUAccessFlags = D3D11_CPU_ACCESS_WRITE, + }; + hr = ID3D11Device_CreateBuffer(p->dev, &vbuf_desc, 
NULL, &new_buf); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create vertex buffer: %s\n", + mp_HRESULT_to_str(hr)); + return -1; + } + + SAFE_RELEASE(p->vbuf); + p->vbuf = new_buf; + p->vbuf_size = new_size; + p->vbuf_used = 0; + } + + bool discard = false; + size_t offset = p->vbuf_used; + if (offset + size > p->vbuf_size) { + // We reached the end of the buffer, so discard and wrap around + discard = true; + offset = 0; + } + + D3D11_MAPPED_SUBRESOURCE map = { 0 }; + hr = ID3D11DeviceContext_Map(p->ctx, (ID3D11Resource *)p->vbuf, 0, + discard ? D3D11_MAP_WRITE_DISCARD : D3D11_MAP_WRITE_NO_OVERWRITE, + 0, &map); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to map vertex buffer: %s\n", mp_HRESULT_to_str(hr)); + return -1; + } + + char *cdata = map.pData; + memcpy(cdata + offset, data, size); + + ID3D11DeviceContext_Unmap(p->ctx, (ID3D11Resource *)p->vbuf, 0); + + p->vbuf_used = offset + size; + return offset; +} + +static const char cache_magic[4] = "RD11"; +static const int cache_version = 2; + +struct cache_header { + char magic[sizeof(cache_magic)]; + int cache_version; + char compiler[SPIRV_NAME_MAX_LEN]; + int spv_compiler_version; + uint32_t cross_version; + struct dll_version d3d_compiler_version; + int feature_level; + size_t vert_bytecode_len; + size_t frag_bytecode_len; + size_t comp_bytecode_len; +}; + +static void load_cached_program(struct ra *ra, + const struct ra_renderpass_params *params, + bstr *vert_bc, + bstr *frag_bc, + bstr *comp_bc) +{ + struct ra_d3d11 *p = ra->priv; + struct spirv_compiler *spirv = p->spirv; + bstr cache = params->cached_program; + + if (cache.len < sizeof(struct cache_header)) + return; + + struct cache_header *header = (struct cache_header *)cache.start; + cache = bstr_cut(cache, sizeof(*header)); + + if (strncmp(header->magic, cache_magic, sizeof(cache_magic)) != 0) + return; + if (header->cache_version != cache_version) + return; + if (strncmp(header->compiler, spirv->name, sizeof(header->compiler)) != 0) + return; + if 
(header->spv_compiler_version != spirv->compiler_version) + return; + if (header->cross_version != crossc_version()) + return; + if (!dll_version_equal(header->d3d_compiler_version, p->d3d_compiler_ver)) + return; + if (header->feature_level != p->fl) + return; + + if (header->vert_bytecode_len && vert_bc) { + *vert_bc = bstr_splice(cache, 0, header->vert_bytecode_len); + MP_VERBOSE(ra, "Using cached vertex shader\n"); + } + cache = bstr_cut(cache, header->vert_bytecode_len); + + if (header->frag_bytecode_len && frag_bc) { + *frag_bc = bstr_splice(cache, 0, header->frag_bytecode_len); + MP_VERBOSE(ra, "Using cached fragment shader\n"); + } + cache = bstr_cut(cache, header->frag_bytecode_len); + + if (header->comp_bytecode_len && comp_bc) { + *comp_bc = bstr_splice(cache, 0, header->comp_bytecode_len); + MP_VERBOSE(ra, "Using cached compute shader\n"); + } + cache = bstr_cut(cache, header->comp_bytecode_len); +} + +static void save_cached_program(struct ra *ra, struct ra_renderpass *pass, + bstr vert_bc, + bstr frag_bc, + bstr comp_bc) +{ + struct ra_d3d11 *p = ra->priv; + struct spirv_compiler *spirv = p->spirv; + + struct cache_header header = { + .cache_version = cache_version, + .spv_compiler_version = p->spirv->compiler_version, + .cross_version = crossc_version(), + .d3d_compiler_version = p->d3d_compiler_ver, + .feature_level = p->fl, + .vert_bytecode_len = vert_bc.len, + .frag_bytecode_len = frag_bc.len, + .comp_bytecode_len = comp_bc.len, + }; + strncpy(header.magic, cache_magic, sizeof(header.magic)); + strncpy(header.compiler, spirv->name, sizeof(header.compiler)); + + struct bstr *prog = &pass->params.cached_program; + bstr_xappend(pass, prog, (bstr){ (char *) &header, sizeof(header) }); + bstr_xappend(pass, prog, vert_bc); + bstr_xappend(pass, prog, frag_bc); + bstr_xappend(pass, prog, comp_bc); +} + +static struct ra_renderpass *renderpass_create_raster(struct ra *ra, + struct ra_renderpass *pass, const struct ra_renderpass_params *params) +{ + struct 
ra_d3d11 *p = ra->priv; + struct d3d_rpass *pass_p = pass->priv; + ID3DBlob *vs_blob = NULL; + ID3DBlob *ps_blob = NULL; + HRESULT hr; + + // load_cached_program will load compiled shader bytecode into vert_bc and + // frag_bc if the cache is valid. If not, vert_bc/frag_bc will remain NULL. + bstr vert_bc = {0}; + bstr frag_bc = {0}; + load_cached_program(ra, params, &vert_bc, &frag_bc, NULL); + + if (!vert_bc.start) { + if (!compile_glsl(ra, GLSL_SHADER_VERTEX, params->vertex_shader, + &vs_blob)) + goto error; + vert_bc = (bstr){ + ID3D10Blob_GetBufferPointer(vs_blob), + ID3D10Blob_GetBufferSize(vs_blob), + }; + } + + hr = ID3D11Device_CreateVertexShader(p->dev, vert_bc.start, vert_bc.len, + NULL, &pass_p->vs); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create vertex shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + if (!frag_bc.start) { + if (!compile_glsl(ra, GLSL_SHADER_FRAGMENT, params->frag_shader, + &ps_blob)) + goto error; + frag_bc = (bstr){ + ID3D10Blob_GetBufferPointer(ps_blob), + ID3D10Blob_GetBufferSize(ps_blob), + }; + } + + hr = ID3D11Device_CreatePixelShader(p->dev, frag_bc.start, frag_bc.len, + NULL, &pass_p->ps); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create pixel shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + D3D11_INPUT_ELEMENT_DESC *in_descs = talloc_array(pass, + D3D11_INPUT_ELEMENT_DESC, params->num_vertex_attribs); + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct ra_renderpass_input *inp = ¶ms->vertex_attribs[i]; + + DXGI_FORMAT fmt = DXGI_FORMAT_UNKNOWN; + switch (inp->type) { + case RA_VARTYPE_FLOAT: + switch (inp->dim_v) { + case 1: fmt = DXGI_FORMAT_R32_FLOAT; break; + case 2: fmt = DXGI_FORMAT_R32G32_FLOAT; break; + case 3: fmt = DXGI_FORMAT_R32G32B32_FLOAT; break; + case 4: fmt = DXGI_FORMAT_R32G32B32A32_FLOAT; break; + } + break; + case RA_VARTYPE_BYTE_UNORM: + switch (inp->dim_v) { + case 1: fmt = DXGI_FORMAT_R8_UNORM; break; + case 2: fmt = DXGI_FORMAT_R8G8_UNORM; break; + // There 
is no 3-component 8-bit DXGI format + case 4: fmt = DXGI_FORMAT_R8G8B8A8_UNORM; break; + } + break; + } + if (fmt == DXGI_FORMAT_UNKNOWN) { + MP_ERR(ra, "Could not find suitable vertex input format\n"); + goto error; + } + + in_descs[i] = (D3D11_INPUT_ELEMENT_DESC) { + // The semantic name doesn't mean much and is just used to verify + // the input description matches the shader. SPIRV-Cross always + // uses TEXCOORD, so we should too. + .SemanticName = "TEXCOORD", + .SemanticIndex = i, + .AlignedByteOffset = inp->offset, + .Format = fmt, + }; + } + + hr = ID3D11Device_CreateInputLayout(p->dev, in_descs, + params->num_vertex_attribs, vert_bc.start, vert_bc.len, + &pass_p->layout); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create IA layout: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + talloc_free(in_descs); + in_descs = NULL; + + D3D11_BLEND_DESC bdesc = { + .RenderTarget[0] = { + .BlendEnable = params->enable_blend, + .SrcBlend = map_ra_blend(params->blend_src_rgb), + .DestBlend = map_ra_blend(params->blend_dst_rgb), + .BlendOp = D3D11_BLEND_OP_ADD, + .SrcBlendAlpha = map_ra_blend(params->blend_src_alpha), + .DestBlendAlpha = map_ra_blend(params->blend_dst_alpha), + .BlendOpAlpha = D3D11_BLEND_OP_ADD, + .RenderTargetWriteMask = D3D11_COLOR_WRITE_ENABLE_ALL, + }, + }; + hr = ID3D11Device_CreateBlendState(p->dev, &bdesc, &pass_p->bstate); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create blend state: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + + save_cached_program(ra, pass, vert_bc, frag_bc, (bstr){0}); + + SAFE_RELEASE(vs_blob); + SAFE_RELEASE(ps_blob); + return pass; + +error: + renderpass_destroy(ra, pass); + SAFE_RELEASE(vs_blob); + SAFE_RELEASE(ps_blob); + return NULL; +} + +static struct ra_renderpass *renderpass_create_compute(struct ra *ra, + struct ra_renderpass *pass, const struct ra_renderpass_params *params) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_rpass *pass_p = pass->priv; + ID3DBlob *cs_blob = NULL; + HRESULT hr; + + bstr 
comp_bc = {0}; + load_cached_program(ra, params, NULL, NULL, &comp_bc); + + if (!comp_bc.start) { + if (!compile_glsl(ra, GLSL_SHADER_COMPUTE, params->compute_shader, + &cs_blob)) + goto error; + comp_bc = (bstr){ + ID3D10Blob_GetBufferPointer(cs_blob), + ID3D10Blob_GetBufferSize(cs_blob), + }; + } + hr = ID3D11Device_CreateComputeShader(p->dev, comp_bc.start, comp_bc.len, + NULL, &pass_p->cs); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create compute shader: %s\n", + mp_HRESULT_to_str(hr)); + goto error; + } + + save_cached_program(ra, pass, (bstr){0}, (bstr){0}, comp_bc); + + SAFE_RELEASE(cs_blob); + return pass; +error: + renderpass_destroy(ra, pass); + SAFE_RELEASE(cs_blob); + return NULL; +} + +static struct ra_renderpass *renderpass_create(struct ra *ra, + const struct ra_renderpass_params *params) +{ + struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass); + pass->params = *ra_renderpass_params_copy(pass, params); + pass->params.cached_program = (bstr){0}; + pass->priv = talloc_zero(pass, struct d3d_rpass); + + if (params->type == RA_RENDERPASS_TYPE_COMPUTE) { + return renderpass_create_compute(ra, pass, params); + } else { + return renderpass_create_raster(ra, pass, params); + } +} + +static void renderpass_run_raster(struct ra *ra, + const struct ra_renderpass_run_params *params, + ID3D11Buffer *ubos[], int ubos_len, + ID3D11SamplerState *samplers[], + ID3D11ShaderResourceView *srvs[], + int samplers_len, + ID3D11UnorderedAccessView *uavs[], + int uavs_len) +{ + struct ra_d3d11 *p = ra->priv; + struct ra_renderpass *pass = params->pass; + struct d3d_rpass *pass_p = pass->priv; + + UINT vbuf_offset = vbuf_upload(ra, params->vertex_data, + pass->params.vertex_stride * params->vertex_count); + if (vbuf_offset == (UINT)-1) + return; + + ID3D11DeviceContext_IASetInputLayout(p->ctx, pass_p->layout); + ID3D11DeviceContext_IASetVertexBuffers(p->ctx, 0, 1, &p->vbuf, + &pass->params.vertex_stride, &vbuf_offset); + 
ID3D11DeviceContext_IASetPrimitiveTopology(p->ctx, + D3D11_PRIMITIVE_TOPOLOGY_TRIANGLELIST); + + ID3D11DeviceContext_VSSetShader(p->ctx, pass_p->vs, NULL, 0); + + ID3D11DeviceContext_RSSetViewports(p->ctx, 1, (&(D3D11_VIEWPORT) { + .TopLeftX = params->viewport.x0, + .TopLeftY = params->viewport.y0, + .Width = mp_rect_w(params->viewport), + .Height = mp_rect_h(params->viewport), + .MinDepth = 0, + .MaxDepth = 1, + })); + ID3D11DeviceContext_RSSetScissorRects(p->ctx, 1, (&(D3D11_RECT) { + .left = params->scissors.x0, + .top = params->scissors.y0, + .right = params->scissors.x1, + .bottom = params->scissors.y1, + })); + ID3D11DeviceContext_PSSetShader(p->ctx, pass_p->ps, NULL, 0); + ID3D11DeviceContext_PSSetConstantBuffers(p->ctx, 0, ubos_len, ubos); + ID3D11DeviceContext_PSSetShaderResources(p->ctx, 0, samplers_len, srvs); + ID3D11DeviceContext_PSSetSamplers(p->ctx, 0, samplers_len, samplers); + + struct ra_tex *target = params->target; + struct d3d_tex *target_p = target->priv; + ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews(p->ctx, 1, + &target_p->rtv, NULL, 1, uavs_len, uavs, NULL); + ID3D11DeviceContext_OMSetBlendState(p->ctx, pass_p->bstate, NULL, + D3D11_DEFAULT_SAMPLE_MASK); + + ID3D11DeviceContext_Draw(p->ctx, params->vertex_count, 0); + + // Unbind everything. It's easier to do this than to actually track state, + // and if we leave the RTV bound, it could trip up D3D's conflict checker. 
+ for (int i = 0; i < ubos_len; i++) + ubos[i] = NULL; + for (int i = 0; i < samplers_len; i++) { + samplers[i] = NULL; + srvs[i] = NULL; + } + for (int i = 0; i < uavs_len; i++) + uavs[i] = NULL; + ID3D11DeviceContext_PSSetConstantBuffers(p->ctx, 0, ubos_len, ubos); + ID3D11DeviceContext_PSSetShaderResources(p->ctx, 0, samplers_len, srvs); + ID3D11DeviceContext_PSSetSamplers(p->ctx, 0, samplers_len, samplers); + ID3D11DeviceContext_OMSetRenderTargetsAndUnorderedAccessViews(p->ctx, 0, + NULL, NULL, 1, uavs_len, uavs, NULL); +} + +static void renderpass_run_compute(struct ra *ra, + const struct ra_renderpass_run_params *params, + ID3D11Buffer *ubos[], int ubos_len, + ID3D11SamplerState *samplers[], + ID3D11ShaderResourceView *srvs[], + int samplers_len, + ID3D11UnorderedAccessView *uavs[], + int uavs_len) +{ + struct ra_d3d11 *p = ra->priv; + struct ra_renderpass *pass = params->pass; + struct d3d_rpass *pass_p = pass->priv; + + ID3D11DeviceContext_CSSetShader(p->ctx, pass_p->cs, NULL, 0); + ID3D11DeviceContext_CSSetConstantBuffers(p->ctx, 0, ubos_len, ubos); + ID3D11DeviceContext_CSSetShaderResources(p->ctx, 0, samplers_len, srvs); + ID3D11DeviceContext_CSSetSamplers(p->ctx, 0, samplers_len, samplers); + ID3D11DeviceContext_CSSetUnorderedAccessViews(p->ctx, 0, uavs_len, uavs, + NULL); + + ID3D11DeviceContext_Dispatch(p->ctx, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + + for (int i = 0; i < ubos_len; i++) + ubos[i] = NULL; + for (int i = 0; i < samplers_len; i++) { + samplers[i] = NULL; + srvs[i] = NULL; + } + for (int i = 0; i < uavs_len; i++) + uavs[i] = NULL; + ID3D11DeviceContext_CSSetConstantBuffers(p->ctx, 0, ubos_len, ubos); + ID3D11DeviceContext_CSSetShaderResources(p->ctx, 0, samplers_len, srvs); + ID3D11DeviceContext_CSSetSamplers(p->ctx, 0, samplers_len, samplers); + ID3D11DeviceContext_CSSetUnorderedAccessViews(p->ctx, 0, uavs_len, uavs, + NULL); +} + +static void renderpass_run(struct ra *ra, + const struct 
ra_renderpass_run_params *params) +{ + struct ra_d3d11 *p = ra->priv; + struct ra_renderpass *pass = params->pass; + enum ra_renderpass_type type = pass->params.type; + + ID3D11Buffer *ubos[D3D11_COMMONSHADER_CONSTANT_BUFFER_API_SLOT_COUNT] = {0}; + int ubos_len = 0; + + ID3D11SamplerState *samplers[D3D11_COMMONSHADER_SAMPLER_SLOT_COUNT] = {0}; + ID3D11ShaderResourceView *srvs[D3D11_COMMONSHADER_SAMPLER_SLOT_COUNT] = {0}; + int samplers_len = 0; + + ID3D11UnorderedAccessView *uavs[D3D11_1_UAV_SLOT_COUNT] = {0}; + int uavs_len = 0; + + // In a raster pass, one of the UAV slots is used by the runtime for the RTV + int uavs_max = type == RA_RENDERPASS_TYPE_COMPUTE ? p->max_uavs + : p->max_uavs - 1; + + // Gather the input variables used in this pass. These will be mapped to + // HLSL registers. + for (int i = 0; i < params->num_values; i++) { + struct ra_renderpass_input_val *val = ¶ms->values[i]; + int binding = pass->params.inputs[val->index].binding; + switch (pass->params.inputs[val->index].type) { + case RA_VARTYPE_BUF_RO: + if (binding > MP_ARRAY_SIZE(ubos)) { + MP_ERR(ra, "Too many constant buffers in pass\n"); + return; + } + struct ra_buf *buf_ro = *(struct ra_buf **)val->data; + buf_resolve(ra, buf_ro); + struct d3d_buf *buf_ro_p = buf_ro->priv; + ubos[binding] = buf_ro_p->buf; + ubos_len = MPMAX(ubos_len, binding + 1); + break; + case RA_VARTYPE_BUF_RW: + if (binding > uavs_max) { + MP_ERR(ra, "Too many UAVs in pass\n"); + return; + } + struct ra_buf *buf_rw = *(struct ra_buf **)val->data; + buf_resolve(ra, buf_rw); + struct d3d_buf *buf_rw_p = buf_rw->priv; + uavs[binding] = buf_rw_p->uav; + uavs_len = MPMAX(uavs_len, binding + 1); + break; + case RA_VARTYPE_TEX: + if (binding > MP_ARRAY_SIZE(samplers)) { + MP_ERR(ra, "Too many textures in pass\n"); + return; + } + struct ra_tex *tex = *(struct ra_tex **)val->data; + struct d3d_tex *tex_p = tex->priv; + samplers[binding] = tex_p->sampler; + srvs[binding] = tex_p->srv; + samplers_len = MPMAX(samplers_len, 
binding + 1); + break; + case RA_VARTYPE_IMG_W: + if (binding > uavs_max) { + MP_ERR(ra, "Too many UAVs in pass\n"); + return; + } + struct ra_tex *img = *(struct ra_tex **)val->data; + struct d3d_tex *img_p = img->priv; + uavs[binding] = img_p->uav; + uavs_len = MPMAX(uavs_len, binding + 1); + break; + } + } + + if (type == RA_RENDERPASS_TYPE_COMPUTE) { + renderpass_run_compute(ra, params, ubos, ubos_len, samplers, srvs, + samplers_len, uavs, uavs_len); + } else { + renderpass_run_raster(ra, params, ubos, ubos_len, samplers, srvs, + samplers_len, uavs, uavs_len); + } +} + +static void timer_destroy(struct ra *ra, ra_timer *ratimer) +{ + if (!ratimer) + return; + struct d3d_timer *timer = ratimer; + + SAFE_RELEASE(timer->ts_start); + SAFE_RELEASE(timer->ts_end); + SAFE_RELEASE(timer->disjoint); + talloc_free(timer); +} + +static ra_timer *timer_create(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + if (!p->has_timestamp_queries) + return NULL; + + struct d3d_timer *timer = talloc_zero(NULL, struct d3d_timer); + HRESULT hr; + + hr = ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &timer->ts_start); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create start query: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + + hr = ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, &timer->ts_end); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create end query: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + + // Measuring duration in D3D11 requires three queries: start and end + // timestamps, and a disjoint query containing a flag which says whether + // the timestamps are usable or if a discontinuity occured between them, + // like a change in power state or clock speed. The disjoint query also + // contains the timer frequency, so the timestamps are useless without it. 
+ hr = ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP_DISJOINT }, &timer->disjoint); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create timer query: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + + return timer; +error: + timer_destroy(ra, timer); + return NULL; +} + +static uint64_t timestamp_to_ns(uint64_t timestamp, uint64_t freq) +{ + static const uint64_t ns_per_s = 1000000000llu; + return timestamp / freq * ns_per_s + timestamp % freq * ns_per_s / freq; +} + +static uint64_t timer_get_result(struct ra *ra, ra_timer *ratimer) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_timer *timer = ratimer; + HRESULT hr; + + UINT64 start, end; + D3D11_QUERY_DATA_TIMESTAMP_DISJOINT dj; + + hr = ID3D11DeviceContext_GetData(p->ctx, + (ID3D11Asynchronous *)timer->ts_end, &end, sizeof(end), + D3D11_ASYNC_GETDATA_DONOTFLUSH); + if (FAILED(hr) || hr == S_FALSE) + return 0; + hr = ID3D11DeviceContext_GetData(p->ctx, + (ID3D11Asynchronous *)timer->ts_start, &start, sizeof(start), + D3D11_ASYNC_GETDATA_DONOTFLUSH); + if (FAILED(hr) || hr == S_FALSE) + return 0; + hr = ID3D11DeviceContext_GetData(p->ctx, + (ID3D11Asynchronous *)timer->disjoint, &dj, sizeof(dj), + D3D11_ASYNC_GETDATA_DONOTFLUSH); + if (FAILED(hr) || hr == S_FALSE || dj.Disjoint || !dj.Frequency) + return 0; + + return timestamp_to_ns(end - start, dj.Frequency); +} + +static void timer_start(struct ra *ra, ra_timer *ratimer) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_timer *timer = ratimer; + + // Latch the last result of this ra_timer (returned by timer_stop) + timer->result = timer_get_result(ra, ratimer); + + ID3D11DeviceContext_Begin(p->ctx, (ID3D11Asynchronous *)timer->disjoint); + ID3D11DeviceContext_End(p->ctx, (ID3D11Asynchronous *)timer->ts_start); +} + +static uint64_t timer_stop(struct ra *ra, ra_timer *ratimer) +{ + struct ra_d3d11 *p = ra->priv; + struct d3d_timer *timer = ratimer; + + ID3D11DeviceContext_End(p->ctx, (ID3D11Asynchronous *)timer->ts_end); + 
ID3D11DeviceContext_End(p->ctx, (ID3D11Asynchronous *)timer->disjoint); + + return timer->result; +} + +static int map_msg_severity(D3D11_MESSAGE_SEVERITY sev) +{ + switch (sev) { + case D3D11_MESSAGE_SEVERITY_CORRUPTION: + return MSGL_FATAL; + case D3D11_MESSAGE_SEVERITY_ERROR: + return MSGL_ERR; + case D3D11_MESSAGE_SEVERITY_WARNING: + return MSGL_WARN; + default: + case D3D11_MESSAGE_SEVERITY_INFO: + case D3D11_MESSAGE_SEVERITY_MESSAGE: + return MSGL_DEBUG; + } +} + +static void debug_marker(struct ra *ra, const char *msg) +{ + struct ra_d3d11 *p = ra->priv; + void *talloc_ctx = talloc_new(NULL); + HRESULT hr; + + if (!p->iqueue) + goto done; + + // Copy debug-layer messages to mpv's log output + bool printed_header = false; + uint64_t messages = ID3D11InfoQueue_GetNumStoredMessages(p->iqueue); + for (uint64_t i = 0; i < messages; i++) { + size_t len; + hr = ID3D11InfoQueue_GetMessage(p->iqueue, i, NULL, &len); + if (FAILED(hr) || !len) + goto done; + + D3D11_MESSAGE *d3dmsg = talloc_size(talloc_ctx, len); + hr = ID3D11InfoQueue_GetMessage(p->iqueue, i, d3dmsg, &len); + if (FAILED(hr)) + goto done; + + int msgl = map_msg_severity(d3dmsg->Severity); + if (mp_msg_test(ra->log, msgl)) { + if (!printed_header) + MP_INFO(ra, "%s:\n", msg); + printed_header = true; + + MP_MSG(ra, msgl, "%d: %.*s\n", (int)d3dmsg->ID, + (int)d3dmsg->DescriptionByteLength, d3dmsg->pDescription); + talloc_free(d3dmsg); + } + } + + ID3D11InfoQueue_ClearStoredMessages(p->iqueue); +done: + talloc_free(talloc_ctx); +} + +static void destroy(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + + // Release everything except the interfaces needed to perform leak checking + SAFE_RELEASE(p->clear_ps); + SAFE_RELEASE(p->clear_vs); + SAFE_RELEASE(p->clear_layout); + SAFE_RELEASE(p->clear_vbuf); + SAFE_RELEASE(p->clear_cbuf); + SAFE_RELEASE(p->blit_float_ps); + SAFE_RELEASE(p->blit_vs); + SAFE_RELEASE(p->blit_layout); + SAFE_RELEASE(p->blit_vbuf); + SAFE_RELEASE(p->blit_sampler); + 
SAFE_RELEASE(p->vbuf); + SAFE_RELEASE(p->ctx1); + SAFE_RELEASE(p->dev1); + SAFE_RELEASE(p->dev); + + if (p->debug && p->ctx) { + // Destroy the device context synchronously so referenced objects don't + // show up in the leak check + ID3D11DeviceContext_ClearState(p->ctx); + ID3D11DeviceContext_Flush(p->ctx); + } + SAFE_RELEASE(p->ctx); + + if (p->debug) { + // Report any leaked objects + debug_marker(ra, "after destroy"); + ID3D11Debug_ReportLiveDeviceObjects(p->debug, D3D11_RLDO_DETAIL); + debug_marker(ra, "after leak check"); + ID3D11Debug_ReportLiveDeviceObjects(p->debug, D3D11_RLDO_SUMMARY); + debug_marker(ra, "after leak summary"); + } + SAFE_RELEASE(p->debug); + SAFE_RELEASE(p->iqueue); + + talloc_free(ra); +} + +static struct ra_fns ra_fns_d3d11 = { + .destroy = destroy, + .tex_create = tex_create, + .tex_destroy = tex_destroy, + .tex_upload = tex_upload, + .buf_create = buf_create, + .buf_destroy = buf_destroy, + .buf_update = buf_update, + .clear = clear, + .blit = blit, + .uniform_layout = std140_layout, + .desc_namespace = desc_namespace, + .renderpass_create = renderpass_create, + .renderpass_destroy = renderpass_destroy, + .renderpass_run = renderpass_run, + .timer_create = timer_create, + .timer_destroy = timer_destroy, + .timer_start = timer_start, + .timer_stop = timer_stop, + .debug_marker = debug_marker, +}; + +void ra_d3d11_flush(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + ID3D11DeviceContext_Flush(p->ctx); +} + +static void init_debug_layer(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + HRESULT hr; + + hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Debug, + (void**)&p->debug); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to get debug device: %s\n", mp_HRESULT_to_str(hr)); + return; + } + + hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11InfoQueue, + (void**)&p->iqueue); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to get info queue: %s\n", mp_HRESULT_to_str(hr)); + return; + } + + // Store an unlimited amount of messages in the 
buffer. This is fine + // because we flush stored messages regularly (in debug_marker.) + ID3D11InfoQueue_SetMessageCountLimit(p->iqueue, -1); + + // Filter some annoying messages + D3D11_MESSAGE_ID deny_ids[] = { + // This error occurs during context creation when we try to figure out + // the real maximum texture size by attempting to create a texture + // larger than the current feature level allows. + D3D11_MESSAGE_ID_CREATETEXTURE2D_INVALIDDIMENSIONS, + + // These are normal. The RA timer queue habitually reuses timer objects + // without retrieving the results. + D3D11_MESSAGE_ID_QUERY_BEGIN_ABANDONING_PREVIOUS_RESULTS, + D3D11_MESSAGE_ID_QUERY_END_ABANDONING_PREVIOUS_RESULTS, + }; + D3D11_INFO_QUEUE_FILTER filter = { + .DenyList = { + .NumIDs = MP_ARRAY_SIZE(deny_ids), + .pIDList = deny_ids, + }, + }; + ID3D11InfoQueue_PushStorageFilter(p->iqueue, &filter); +} + +static struct dll_version get_dll_version(HMODULE dll) +{ + void *ctx = talloc_new(NULL); + struct dll_version ret = { 0 }; + + HRSRC rsrc = FindResourceW(dll, MAKEINTRESOURCEW(VS_VERSION_INFO), + MAKEINTRESOURCEW(VS_FILE_INFO)); + if (!rsrc) + goto done; + DWORD size = SizeofResource(dll, rsrc); + HGLOBAL res = LoadResource(dll, rsrc); + if (!res) + goto done; + void *ptr = LockResource(res); + if (!ptr) + goto done; + void *copy = talloc_memdup(ctx, ptr, size); + + VS_FIXEDFILEINFO *ffi; + UINT ffi_len; + if (!VerQueryValueW(copy, L"\\", (void**)&ffi, &ffi_len)) + goto done; + if (ffi_len < sizeof(*ffi)) + goto done; + + ret.major = HIWORD(ffi->dwFileVersionMS); + ret.minor = LOWORD(ffi->dwFileVersionMS); + ret.build = HIWORD(ffi->dwFileVersionLS); + ret.revision = LOWORD(ffi->dwFileVersionLS); + +done: + talloc_free(ctx); + return ret; +} + +static bool load_d3d_compiler(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + HMODULE d3dcompiler = NULL; + + // Try the inbox D3DCompiler first (Windows 8.1 and up) + if (IsWindows8Point1OrGreater()) { + d3dcompiler = 
LoadLibraryExW(L"d3dcompiler_47.dll", NULL, + LOAD_LIBRARY_SEARCH_SYSTEM32); + } + // Check for a packaged version of d3dcompiler_47.dll + if (!d3dcompiler) + d3dcompiler = LoadLibraryW(L"d3dcompiler_47.dll"); + // Try d3dcompiler_46.dll from the Windows 8 SDK + if (!d3dcompiler) + d3dcompiler = LoadLibraryW(L"d3dcompiler_46.dll"); + // Try d3dcompiler_43.dll from the June 2010 DirectX SDK + if (!d3dcompiler) + d3dcompiler = LoadLibraryW(L"d3dcompiler_43.dll"); + // Can't find any compiler DLL, so give up + if (!d3dcompiler) + return false; + + p->d3d_compiler_ver = get_dll_version(d3dcompiler); + + p->D3DCompile = (pD3DCompile)GetProcAddress(d3dcompiler, "D3DCompile"); + if (!p->D3DCompile) + return false; + return true; +} + +static void find_max_texture_dimension(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + + D3D11_TEXTURE2D_DESC desc = { + .Width = ra->max_texture_wh, + .Height = ra->max_texture_wh, + .MipLevels = 1, + .ArraySize = 1, + .SampleDesc.Count = 1, + .Format = DXGI_FORMAT_R8_UNORM, + .BindFlags = D3D11_BIND_SHADER_RESOURCE, + }; + while (true) { + desc.Height = desc.Width *= 2; + if (desc.Width >= 0x8000000u) + return; + if (FAILED(ID3D11Device_CreateTexture2D(p->dev, &desc, NULL, NULL))) + return; + ra->max_texture_wh = desc.Width; + } +} + +struct ra *ra_d3d11_create(ID3D11Device *dev, struct mp_log *log, + struct spirv_compiler *spirv) +{ + HRESULT hr; + + struct ra *ra = talloc_zero(NULL, struct ra); + ra->log = log; + ra->fns = &ra_fns_d3d11; + + // Even Direct3D 10level9 supports 3D textures + ra->caps = RA_CAP_TEX_3D | RA_CAP_DIRECT_UPLOAD | RA_CAP_BUF_RO | + RA_CAP_BLIT | spirv->ra_caps; + + ra->glsl_version = spirv->glsl_version; + ra->glsl_vulkan = true; + + struct ra_d3d11 *p = ra->priv = talloc_zero(ra, struct ra_d3d11); + p->spirv = spirv; + + int minor = 0; + ID3D11Device_AddRef(dev); + p->dev = dev; + ID3D11Device_GetImmediateContext(p->dev, &p->ctx); + hr = ID3D11Device_QueryInterface(p->dev, &IID_ID3D11Device1, + 
(void**)&p->dev1); + if (SUCCEEDED(hr)) { + minor = 1; + ID3D11Device1_GetImmediateContext1(p->dev1, &p->ctx1); + + D3D11_FEATURE_DATA_D3D11_OPTIONS fopts = { 0 }; + hr = ID3D11Device_CheckFeatureSupport(p->dev, + D3D11_FEATURE_D3D11_OPTIONS, &fopts, sizeof(fopts)); + if (SUCCEEDED(hr)) { + p->has_clear_view = fopts.ClearView; + } + } + + MP_VERBOSE(ra, "Using Direct3D 11.%d runtime\n", minor); + + p->fl = ID3D11Device_GetFeatureLevel(p->dev); + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + ra->max_texture_wh = D3D11_REQ_TEXTURE2D_U_OR_V_DIMENSION; + } else if (p->fl >= D3D_FEATURE_LEVEL_10_0) { + ra->max_texture_wh = D3D10_REQ_TEXTURE2D_U_OR_V_DIMENSION; + } else if (p->fl >= D3D_FEATURE_LEVEL_9_3) { + ra->max_texture_wh = D3D_FL9_3_REQ_TEXTURE2D_U_OR_V_DIMENSION; + } else { + ra->max_texture_wh = D3D_FL9_1_REQ_TEXTURE2D_U_OR_V_DIMENSION; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_0) + ra->caps |= RA_CAP_GATHER; + if (p->fl >= D3D_FEATURE_LEVEL_10_0) + ra->caps |= RA_CAP_FRAGCOORD; + + // Some 10_0 hardware has compute shaders, but only 11_0 has image load/store + if (p->fl >= D3D_FEATURE_LEVEL_11_0) { + ra->caps |= RA_CAP_COMPUTE | RA_CAP_BUF_RW; + ra->max_shmem = 32 * 1024; + } + + if (p->fl >= D3D_FEATURE_LEVEL_11_1) { + p->max_uavs = D3D11_1_UAV_SLOT_COUNT; + } else { + p->max_uavs = D3D11_PS_CS_UAV_REGISTER_COUNT; + } + + if (ID3D11Device_GetCreationFlags(p->dev) & D3D11_CREATE_DEVICE_DEBUG) + init_debug_layer(ra); + + // Some level 9_x devices don't have timestamp queries + hr = ID3D11Device_CreateQuery(p->dev, + &(D3D11_QUERY_DESC) { D3D11_QUERY_TIMESTAMP }, NULL); + p->has_timestamp_queries = SUCCEEDED(hr); + + // According to MSDN, the above texture sizes are just minimums and drivers + // may support larger textures. 
See: + // https://msdn.microsoft.com/en-us/library/windows/desktop/ff476874.aspx + find_max_texture_dimension(ra); + MP_VERBOSE(ra, "Maximum Texture2D size: %dx%d\n", ra->max_texture_wh, + ra->max_texture_wh); + + if (!load_d3d_compiler(ra)) { + MP_FATAL(ra, "Could not find D3DCompiler DLL\n"); + goto error; + } + + MP_VERBOSE(ra, "D3DCompiler version: %u.%u.%u.%u\n", + p->d3d_compiler_ver.major, p->d3d_compiler_ver.minor, + p->d3d_compiler_ver.build, p->d3d_compiler_ver.revision); + + setup_formats(ra); + + // The rasterizer state never changes, so set it up here + ID3D11RasterizerState *rstate; + D3D11_RASTERIZER_DESC rdesc = { + .FillMode = D3D11_FILL_SOLID, + .CullMode = D3D11_CULL_NONE, + .FrontCounterClockwise = FALSE, + .DepthClipEnable = TRUE, // Required for 10level9 + .ScissorEnable = TRUE, + }; + hr = ID3D11Device_CreateRasterizerState(p->dev, &rdesc, &rstate); + if (FAILED(hr)) { + MP_ERR(ra, "Failed to create rasterizer state: %s\n", mp_HRESULT_to_str(hr)); + goto error; + } + ID3D11DeviceContext_RSSetState(p->ctx, rstate); + SAFE_RELEASE(rstate); + + // If the device doesn't support ClearView, we have to set up a + // shader-based clear() implementation + if (!p->has_clear_view && !setup_clear_rpass(ra)) + goto error; + + if (!setup_blit_rpass(ra)) + goto error; + + return ra; + +error: + destroy(ra); + return NULL; +} + +ID3D11Device *ra_d3d11_get_device(struct ra *ra) +{ + struct ra_d3d11 *p = ra->priv; + ID3D11Device_AddRef(p->dev); + return p->dev; +} + +bool ra_is_d3d11(struct ra *ra) +{ + return ra->fns == &ra_fns_d3d11; +} diff --git a/video/out/d3d11/ra_d3d11.h b/video/out/d3d11/ra_d3d11.h new file mode 100644 index 0000000..54033b6 --- /dev/null +++ b/video/out/d3d11/ra_d3d11.h @@ -0,0 +1,35 @@ +#pragma once + +#include <stdbool.h> +#include <windows.h> +#include <d3d11.h> +#include <dxgi1_2.h> + +#include "video/out/gpu/ra.h" +#include "video/out/gpu/spirv.h" + +// Create an RA instance from a D3D11 device. 
This takes a reference to the +// device, which is released when the RA instance is destroyed. +struct ra *ra_d3d11_create(ID3D11Device *device, struct mp_log *log, + struct spirv_compiler *spirv); + +// Flush the immediate context of the wrapped D3D11 device +void ra_d3d11_flush(struct ra *ra); + +// Create an RA texture from a D3D11 resource. This takes a reference to the +// texture, which is released when the RA texture is destroyed. +struct ra_tex *ra_d3d11_wrap_tex(struct ra *ra, ID3D11Resource *res); + +// As above, but for a D3D11VA video resource. The fmt parameter selects which +// plane of a planar format will be mapped when the RA texture is used. +// array_slice should be set for texture arrays and is ignored for non-arrays. +struct ra_tex *ra_d3d11_wrap_tex_video(struct ra *ra, ID3D11Texture2D *res, + int w, int h, int array_slice, + const struct ra_format *fmt); + +// Get the underlying D3D11 device from an RA instance. The returned device is +// refcounted and must be released by the caller. +ID3D11Device *ra_d3d11_get_device(struct ra *ra); + +// True if the RA instance was created with ra_d3d11_create() +bool ra_is_d3d11(struct ra *ra); diff --git a/video/out/drm_atomic.c b/video/out/drm_atomic.c new file mode 100644 index 0000000..7a55483 --- /dev/null +++ b/video/out/drm_atomic.c @@ -0,0 +1,245 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. 
If not, see <http://www.gnu.org/licenses/>. + */ + +#include <errno.h> +#include <inttypes.h> + +#include "common/common.h" +#include "common/msg.h" +#include "drm_atomic.h" + +int drm_object_create_properties(struct mp_log *log, int fd, + struct drm_object *object) +{ + object->props = drmModeObjectGetProperties(fd, object->id, object->type); + if (object->props) { + object->props_info = talloc_zero_size(NULL, object->props->count_props + * sizeof(object->props_info)); + if (object->props_info) { + for (int i = 0; i < object->props->count_props; i++) + object->props_info[i] = drmModeGetProperty(fd, object->props->props[i]); + } else { + mp_err(log, "Out of memory\n"); + goto fail; + } + } else { + mp_err(log, "Failed to retrieve properties for object id %d\n", object->id); + goto fail; + } + + return 0; + + fail: + drm_object_free_properties(object); + return -1; +} + +void drm_object_free_properties(struct drm_object *object) +{ + if (object->props) { + for (int i = 0; i < object->props->count_props; i++) { + if (object->props_info[i]) { + drmModeFreeProperty(object->props_info[i]); + object->props_info[i] = NULL; + } + } + + talloc_free(object->props_info); + object->props_info = NULL; + + drmModeFreeObjectProperties(object->props); + object->props = NULL; + } +} + +int drm_object_get_property(struct drm_object *object, char *name, uint64_t *value) +{ + for (int i = 0; i < object->props->count_props; i++) { + if (strcasecmp(name, object->props_info[i]->name) == 0) { + *value = object->props->prop_values[i]; + return 0; + } + } + + return -EINVAL; +} + +int drm_object_set_property(drmModeAtomicReq *request, struct drm_object *object, + char *name, uint64_t value) +{ + for (int i = 0; i < object->props->count_props; i++) { + if (strcasecmp(name, object->props_info[i]->name) == 0) { + return drmModeAtomicAddProperty(request, object->id, + object->props_info[i]->prop_id, value); + } + } + + return -EINVAL; +} + +struct drm_object * drm_object_create(struct mp_log 
*log, int fd, + uint32_t object_id, uint32_t type) +{ + struct drm_object *obj = NULL; + obj = talloc_zero(NULL, struct drm_object); + obj->id = object_id; + obj->type = type; + + if (drm_object_create_properties(log, fd, obj)) { + talloc_free(obj); + return NULL; + } + + return obj; +} + +void drm_object_free(struct drm_object *object) +{ + if (object) { + drm_object_free_properties(object); + talloc_free(object); + } +} + +void drm_object_print_info(struct mp_log *log, struct drm_object *object) +{ + mp_err(log, "Object ID = %d (type = %x) has %d properties\n", + object->id, object->type, object->props->count_props); + + for (int i = 0; i < object->props->count_props; i++) + mp_err(log, " Property '%s' = %lld\n", object->props_info[i]->name, + (long long)object->props->prop_values[i]); +} + +struct drm_atomic_context *drm_atomic_create_context(struct mp_log *log, int fd, + int crtc_id, int overlay_id) +{ + drmModePlane *drmplane = NULL; + drmModePlaneRes *plane_res = NULL; + drmModeRes *res = NULL; + struct drm_object *plane = NULL; + struct drm_atomic_context *ctx; + int crtc_index = -1; + int layercount = 0; + uint64_t value; + + res = drmModeGetResources(fd); + if (!res) { + mp_err(log, "Cannot retrieve DRM resources: %s\n", mp_strerror(errno)); + goto fail; + } + + plane_res = drmModeGetPlaneResources(fd); + if (!plane_res) { + mp_err(log, "Cannot retrieve plane ressources: %s\n", mp_strerror(errno)); + goto fail; + } + + ctx = talloc_zero(NULL, struct drm_atomic_context); + if (!ctx) { + mp_err(log, "Out of memory\n"); + goto fail; + } + + ctx->fd = fd; + ctx->crtc = drm_object_create(log, ctx->fd, crtc_id, DRM_MODE_OBJECT_CRTC); + if (!ctx->crtc) { + mp_err(log, "Failed to create CRTC object\n"); + goto fail; + } + + for (int i = 0; i < res->count_crtcs; i++) { + if (res->crtcs[i] == crtc_id) { + crtc_index = i; + break; + } + } + + for (unsigned int j = 0; j < plane_res->count_planes; j++) { + + drmplane = drmModeGetPlane (ctx->fd, plane_res->planes[j]); + 
if (drmplane->possible_crtcs & (1 << crtc_index)) { + plane = drm_object_create(log, ctx->fd, drmplane->plane_id, + DRM_MODE_OBJECT_PLANE); + + if (plane) { + if (drm_object_get_property(plane, "TYPE", &value) == -EINVAL) { + mp_err(log, "Unable to retrieve type property from plane %d\n", j); + goto fail; + } else { + if ((value == DRM_PLANE_TYPE_OVERLAY) && + (layercount == overlay_id)) { + ctx->overlay_plane = plane; + } + else if (value == DRM_PLANE_TYPE_PRIMARY) { + ctx->primary_plane = plane; + } + else { + drm_object_free(plane); + plane = NULL; + } + + if (value == DRM_PLANE_TYPE_OVERLAY) + layercount++; + } + } else { + mp_err(log, "Failed to create Plane object from plane ID %d\n", + drmplane->plane_id); + goto fail; + } + } + drmModeFreePlane(drmplane); + drmplane = NULL; + } + + if (!ctx->primary_plane) { + mp_err(log, "Failed to find primary plane\n"); + goto fail; + } + + if (!ctx->overlay_plane) { + mp_err(log, "Failed to find overlay plane with id=%d\n", overlay_id); + goto fail; + } + + mp_verbose(log, "Found Primary plane with ID %d, overlay with ID %d\n", + ctx->primary_plane->id, ctx->overlay_plane->id); + + drmModeFreePlaneResources(plane_res); + drmModeFreeResources(res); + return ctx; + + +fail: + if (res) + drmModeFreeResources(res); + if (plane_res) + drmModeFreePlaneResources(plane_res); + if (drmplane) + drmModeFreePlane(drmplane); + if (plane) + drm_object_free(plane); + return NULL; +} + +void drm_atomic_destroy_context(struct drm_atomic_context *ctx) +{ + drm_object_free(ctx->crtc); + drm_object_free(ctx->primary_plane); + drm_object_free(ctx->overlay_plane); + talloc_free(ctx); +} diff --git a/video/out/drm_atomic.h b/video/out/drm_atomic.h new file mode 100644 index 0000000..d0ebdb9 --- /dev/null +++ b/video/out/drm_atomic.h @@ -0,0 +1,55 @@ +/* + * This file is part of mpv. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef MP_DRMATOMIC_H +#define MP_DRMATOMIC_H + +#include <stdlib.h> +#include <xf86drm.h> +#include <xf86drmMode.h> + +#include "common/msg.h" + +struct drm_object { + uint32_t id; + uint32_t type; + drmModeObjectProperties *props; + drmModePropertyRes **props_info; +}; + +struct drm_atomic_context { + int fd; + + struct drm_object *crtc; + struct drm_object *primary_plane; + struct drm_object *overlay_plane; + + drmModeAtomicReq *request; +}; + + +int drm_object_create_properties(struct mp_log *log, int fd, struct drm_object *object); +void drm_object_free_properties(struct drm_object *object); +int drm_object_get_property(struct drm_object *object, char *name, uint64_t *value); +int drm_object_set_property(drmModeAtomicReq *request, struct drm_object *object, char *name, uint64_t value); +struct drm_object * drm_object_create(struct mp_log *log, int fd, uint32_t object_id, uint32_t type); +void drm_object_free(struct drm_object *object); +void drm_object_print_info(struct mp_log *log, struct drm_object *object); +struct drm_atomic_context *drm_atomic_create_context(struct mp_log *log, int fd, int crtc_id, int overlay_id); +void drm_atomic_destroy_context(struct drm_atomic_context *ctx); + +#endif // MP_DRMATOMIC_H diff --git a/video/out/drm_common.c b/video/out/drm_common.c index aea4afa..8402ac7 100644 --- 
a/video/out/drm_common.c +++ b/video/out/drm_common.c @@ -41,6 +41,18 @@ static int vt_switcher_pipe[2]; +#define OPT_BASE_STRUCT struct drm_opts +const struct m_sub_options drm_conf = { + .opts = (const struct m_option[]) { + OPT_STRING_VALIDATE("drm-connector", drm_connector_spec, + 0, drm_validate_connector_opt), + OPT_INT("drm-mode", drm_mode_id, 0), + OPT_INT("drm-overlay", drm_overlay_id, 0), + {0}, + }, + .size = sizeof(struct drm_opts), +}; + static const char *connector_names[] = { "Unknown", // DRM_MODE_CONNECTOR_Unknown "VGA", // DRM_MODE_CONNECTOR_VGA @@ -222,7 +234,7 @@ static void parse_connector_spec(struct mp_log *log, struct kms *kms_create(struct mp_log *log, const char *connector_spec, - int mode_id) + int mode_id, int overlay_id) { int card_no = -1; char *connector_name = NULL; @@ -260,6 +272,23 @@ struct kms *kms_create(struct mp_log *log, const char *connector_spec, if (!setup_mode(kms, mode_id)) goto err; + // Universal planes allows accessing all the planes (including primary) + if (drmSetClientCap(kms->fd, DRM_CLIENT_CAP_UNIVERSAL_PLANES, 1)) { + mp_err(log, "Failed to set Universal planes capability\n"); + } + + if (drmSetClientCap(kms->fd, DRM_CLIENT_CAP_ATOMIC, 1)) { + mp_verbose(log, "No DRM Atomic support found\n"); + } else { + mp_verbose(log, "DRM Atomic support found\n"); + kms->atomic_context = drm_atomic_create_context(kms->log, kms->fd, kms->crtc_id, overlay_id); + if (!kms->atomic_context) { + mp_err(log, "Failed to create DRM atomic context\n"); + goto err; + } + } + + drmModeFreeResources(res); return kms; @@ -284,6 +313,10 @@ void kms_destroy(struct kms *kms) drmModeFreeEncoder(kms->encoder); kms->encoder = NULL; } + if (kms->atomic_context) { + drm_atomic_destroy_context(kms->atomic_context); + } + close(kms->fd); talloc_free(kms); } diff --git a/video/out/drm_common.h b/video/out/drm_common.h index 6796472..ff913ff 100644 --- a/video/out/drm_common.h +++ b/video/out/drm_common.h @@ -22,6 +22,7 @@ #include <xf86drm.h> 
#include <xf86drmMode.h> #include "options/m_option.h" +#include "drm_atomic.h" struct kms { struct mp_log *log; @@ -31,6 +32,7 @@ struct kms { drmModeModeInfo mode; uint32_t crtc_id; int card_no; + struct drm_atomic_context *atomic_context; }; struct vt_switcher { @@ -40,6 +42,12 @@ struct vt_switcher { void *handler_data[2]; }; +struct drm_opts { + char *drm_connector_spec; + int drm_mode_id; + int drm_overlay_id; +}; + bool vt_switcher_init(struct vt_switcher *s, struct mp_log *log); void vt_switcher_destroy(struct vt_switcher *s); void vt_switcher_poll(struct vt_switcher *s, int timeout_ms); @@ -51,7 +59,7 @@ void vt_switcher_release(struct vt_switcher *s, void (*handler)(void*), void *user_data); struct kms *kms_create(struct mp_log *log, const char *connector_spec, - int mode_id); + int mode_id, int overlay_id); void kms_destroy(struct kms *kms); double kms_get_display_fps(const struct kms *kms); diff --git a/video/out/drm_prime.c b/video/out/drm_prime.c new file mode 100644 index 0000000..253fbb6 --- /dev/null +++ b/video/out/drm_prime.c @@ -0,0 +1,85 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <unistd.h> +#include <xf86drm.h> +#include <xf86drmMode.h> + +#include "common/msg.h" +#include "drm_common.h" +#include "drm_prime.h" + +int drm_prime_create_framebuffer(struct mp_log *log, int fd, AVDRMFrameDescriptor *descriptor, int width, int height, + struct drm_prime_framebuffer *framebuffer) +{ + AVDRMLayerDescriptor *layer = NULL; + uint32_t pitches[4], offsets[4], handles[4]; + int ret, layer_fd; + + if (descriptor && descriptor->nb_layers) { + *framebuffer = (struct drm_prime_framebuffer){0}; + + for (int object = 0; object < descriptor->nb_objects; object++) { + ret = drmPrimeFDToHandle(fd, descriptor->objects[object].fd, &framebuffer->gem_handles[object]); + if (ret < 0) { + mp_err(log, "Failed to retrieve the Prime Handle from handle %d (%d).\n", object, descriptor->objects[object].fd); + goto fail; + } + } + + layer = &descriptor->layers[0]; + + for (int plane = 0; plane < AV_DRM_MAX_PLANES; plane++) { + layer_fd = framebuffer->gem_handles[layer->planes[plane].object_index]; + if (layer_fd && layer->planes[plane].pitch) { + pitches[plane] = layer->planes[plane].pitch; + offsets[plane] = layer->planes[plane].offset; + handles[plane] = layer_fd; + } else { + pitches[plane] = 0; + offsets[plane] = 0; + handles[plane] = 0; + } + } + + ret = drmModeAddFB2(fd, width, height, layer->format, + handles, pitches, offsets, &framebuffer->fb_id, 0); + if (ret < 0) { + mp_err(log, "Failed to create framebuffer on layer %d.\n", 0); + goto fail; + } + } + + return 0; + +fail: + memset(framebuffer, 0, sizeof(*framebuffer)); + return -1; +} + +void drm_prime_destroy_framebuffer(struct mp_log *log, int fd, struct drm_prime_framebuffer *framebuffer) +{ + if (framebuffer->fb_id) + drmModeRmFB(fd, framebuffer->fb_id); + + for (int i = 0; i < AV_DRM_MAX_PLANES; i++) { + if (framebuffer->gem_handles[i]) + drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, &framebuffer->gem_handles[i]); + } + + memset(framebuffer, 0, sizeof(*framebuffer)); +} diff --git 
a/video/out/win32/exclusive_hack.h b/video/out/drm_prime.h index 883e215..0653fdb 100644 --- a/video/out/win32/exclusive_hack.h +++ b/video/out/drm_prime.h @@ -15,12 +15,19 @@ * License along with mpv. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef MP_WIN32_EXCLUSIVE_HACK_H_ -#define MP_WIN32_EXCLUSIVE_HACK_H_ +#ifndef DRM_PRIME_H +#define DRM_PRIME_H -#include <stdbool.h> +#include <libavutil/hwcontext_drm.h> -// Returns true if any program on the computer is in exclusive fullscreen mode -bool mp_w32_is_in_exclusive_mode(void); +#include "common/msg.h" -#endif +struct drm_prime_framebuffer { + uint32_t fb_id; + uint32_t gem_handles[AV_DRM_MAX_PLANES]; +}; + +int drm_prime_create_framebuffer(struct mp_log *log, int fd, AVDRMFrameDescriptor *descriptor, int width, int height, + struct drm_prime_framebuffer *framebuffers); +void drm_prime_destroy_framebuffer(struct mp_log *log, int fd, struct drm_prime_framebuffer *framebuffers); +#endif // DRM_PRIME_H diff --git a/video/out/gpu/context.c b/video/out/gpu/context.c new file mode 100644 index 0000000..36f9c2d --- /dev/null +++ b/video/out/gpu/context.c @@ -0,0 +1,223 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <stddef.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <stdbool.h> +#include <math.h> +#include <assert.h> + +#include "config.h" +#include "common/common.h" +#include "common/msg.h" +#include "options/options.h" +#include "options/m_option.h" +#include "video/out/vo.h" + +#include "context.h" +#include "spirv.h" + +/* OpenGL */ +extern const struct ra_ctx_fns ra_ctx_glx; +extern const struct ra_ctx_fns ra_ctx_glx_probe; +extern const struct ra_ctx_fns ra_ctx_x11_egl; +extern const struct ra_ctx_fns ra_ctx_drm_egl; +extern const struct ra_ctx_fns ra_ctx_cocoa; +extern const struct ra_ctx_fns ra_ctx_wayland_egl; +extern const struct ra_ctx_fns ra_ctx_wgl; +extern const struct ra_ctx_fns ra_ctx_angle; +extern const struct ra_ctx_fns ra_ctx_dxgl; +extern const struct ra_ctx_fns ra_ctx_rpi; +extern const struct ra_ctx_fns ra_ctx_android; +extern const struct ra_ctx_fns ra_ctx_mali_fbdev; +extern const struct ra_ctx_fns ra_ctx_vdpauglx; + +/* Vulkan */ +extern const struct ra_ctx_fns ra_ctx_vulkan_wayland; +extern const struct ra_ctx_fns ra_ctx_vulkan_win; +extern const struct ra_ctx_fns ra_ctx_vulkan_xlib; + +/* Direct3D 11 */ +extern const struct ra_ctx_fns ra_ctx_d3d11; + +static const struct ra_ctx_fns *contexts[] = { +#if HAVE_D3D11 + &ra_ctx_d3d11, +#endif + +// OpenGL contexts: +#if HAVE_ANDROID + &ra_ctx_android, +#endif +#if HAVE_RPI + &ra_ctx_rpi, +#endif +#if HAVE_GL_COCOA + &ra_ctx_cocoa, +#endif +#if HAVE_EGL_ANGLE_WIN32 + &ra_ctx_angle, +#endif +#if HAVE_GL_WIN32 + &ra_ctx_wgl, +#endif +#if HAVE_GL_DXINTEROP + &ra_ctx_dxgl, +#endif +#if HAVE_GL_X11 + &ra_ctx_glx_probe, +#endif +#if HAVE_EGL_X11 + &ra_ctx_x11_egl, +#endif +#if HAVE_GL_X11 + &ra_ctx_glx, +#endif +#if HAVE_GL_WAYLAND + &ra_ctx_wayland_egl, +#endif +#if HAVE_EGL_DRM + &ra_ctx_drm_egl, +#endif +#if HAVE_MALI_FBDEV + &ra_ctx_mali_fbdev, +#endif +#if HAVE_VDPAU_GL_X11 + &ra_ctx_vdpauglx, +#endif + +// Vulkan contexts: +#if HAVE_VULKAN + +#if 
HAVE_WIN32_DESKTOP + &ra_ctx_vulkan_win, +#endif +#if HAVE_WAYLAND + &ra_ctx_vulkan_wayland, +#endif +#if HAVE_X11 + &ra_ctx_vulkan_xlib, +#endif + +#endif +}; + +int ra_ctx_validate_api(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + if (bstr_equals0(param, "help")) { + mp_info(log, "GPU APIs (contexts):\n"); + mp_info(log, " auto (autodetect)\n"); + for (int n = 0; n < MP_ARRAY_SIZE(contexts); n++) + mp_info(log, " %s (%s)\n", contexts[n]->type, contexts[n]->name); + return M_OPT_EXIT; + } + if (bstr_equals0(param, "auto")) + return 1; + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (bstr_equals0(param, contexts[i]->type)) + return 1; + } + return M_OPT_INVALID; +} + +int ra_ctx_validate_context(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + if (bstr_equals0(param, "help")) { + mp_info(log, "GPU contexts (APIs):\n"); + mp_info(log, " auto (autodetect)\n"); + for (int n = 0; n < MP_ARRAY_SIZE(contexts); n++) + mp_info(log, " %s (%s)\n", contexts[n]->name, contexts[n]->type); + return M_OPT_EXIT; + } + if (bstr_equals0(param, "auto")) + return 1; + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (bstr_equals0(param, contexts[i]->name)) + return 1; + } + return M_OPT_INVALID; +} + +// Create a VO window and create a RA context on it. +// vo_flags: passed to the backend's create window function +struct ra_ctx *ra_ctx_create(struct vo *vo, const char *context_type, + const char *context_name, struct ra_ctx_opts opts) +{ + bool api_auto = !context_type || strcmp(context_type, "auto") == 0; + bool ctx_auto = !context_name || strcmp(context_name, "auto") == 0; + + if (ctx_auto) { + MP_VERBOSE(vo, "Probing for best GPU context.\n"); + opts.probing = true; + } + + // Hack to silence backend (X11/Wayland/etc.) errors. 
Kill it once backends + // are separate from `struct vo` + bool old_probing = vo->probing; + vo->probing = opts.probing; + + for (int i = 0; i < MP_ARRAY_SIZE(contexts); i++) { + if (!opts.probing && strcmp(contexts[i]->name, context_name) != 0) + continue; + if (!api_auto && strcmp(contexts[i]->type, context_type) != 0) + continue; + + struct ra_ctx *ctx = talloc_ptrtype(NULL, ctx); + *ctx = (struct ra_ctx) { + .vo = vo, + .global = vo->global, + .log = mp_log_new(ctx, vo->log, contexts[i]->type), + .opts = opts, + .fns = contexts[i], + }; + + MP_VERBOSE(ctx, "Initializing GPU context '%s'\n", ctx->fns->name); + if (contexts[i]->init(ctx)) { + vo->probing = old_probing; + return ctx; + } + + talloc_free(ctx); + } + + vo->probing = old_probing; + + // If we've reached this point, then none of the contexts matched the name + // requested, or the backend creation failed for all of them. + if (!vo->probing) + MP_ERR(vo, "Failed initializing any suitable GPU context!\n"); + return NULL; +} + +void ra_ctx_destroy(struct ra_ctx **ctx_ptr) +{ + struct ra_ctx *ctx = *ctx_ptr; + if (!ctx) + return; + + if (ctx->spirv && ctx->spirv->fns->uninit) + ctx->spirv->fns->uninit(ctx); + + ctx->fns->uninit(ctx); + talloc_free(ctx); + + *ctx_ptr = NULL; +} diff --git a/video/out/gpu/context.h b/video/out/gpu/context.h new file mode 100644 index 0000000..78c0441 --- /dev/null +++ b/video/out/gpu/context.h @@ -0,0 +1,101 @@ +#pragma once + +#include "video/out/vo.h" + +#include "config.h" +#include "ra.h" + +struct ra_ctx_opts { + int allow_sw; // allow software renderers + int want_alpha; // create an alpha framebuffer if possible + int debug; // enable debugging layers/callbacks etc. 
+ bool probing; // the backend was auto-probed + int swapchain_depth; // max number of images to render ahead +}; + +struct ra_ctx { + struct vo *vo; + struct ra *ra; + struct mpv_global *global; + struct mp_log *log; + + struct ra_ctx_opts opts; + const struct ra_ctx_fns *fns; + struct ra_swapchain *swapchain; + struct spirv_compiler *spirv; + + void *priv; +}; + +// The functions that make up a ra_ctx. +struct ra_ctx_fns { + const char *type; // API type (for --gpu-api) + const char *name; // name (for --gpu-context) + + // Resize the window, or create a new window if there isn't one yet. + // Currently, there is an unfortunate interaction with ctx->vo, and + // display size etc. are determined by it. + bool (*reconfig)(struct ra_ctx *ctx); + + // This behaves exactly like vo_driver.control(). + int (*control)(struct ra_ctx *ctx, int *events, int request, void *arg); + + // These behave exactly like vo_driver.wakeup/wait_events. They are + // optional. + void (*wakeup)(struct ra_ctx *ctx); + void (*wait_events)(struct ra_ctx *ctx, int64_t until_time_us); + + // Initialize/destroy the 'struct ra' and possibly the underlying VO backend. + // Not normally called by the user of the ra_ctx. + bool (*init)(struct ra_ctx *ctx); + void (*uninit)(struct ra_ctx *ctx); +}; + +// Extra struct for the swapchain-related functions so they can be easily +// inherited from helpers. +struct ra_swapchain { + struct ra_ctx *ctx; + struct priv *priv; + const struct ra_swapchain_fns *fns; +}; + +// Represents a framebuffer / render target +struct ra_fbo { + struct ra_tex *tex; + bool flip; // rendering needs to be inverted +}; + +struct ra_swapchain_fns { + // Gets the current framebuffer depth in bits (0 if unknown). Optional. + int (*color_depth)(struct ra_swapchain *sw); + + // Retrieves a screenshot of the framebuffer. Optional. + struct mp_image *(*screenshot)(struct ra_swapchain *sw); + + // Called when rendering starts. Returns NULL on failure. 
This must be + // followed by submit_frame, to submit the rendered frame. This function + // can also fail sporadically, and such errors should be ignored unless + // they persist. + bool (*start_frame)(struct ra_swapchain *sw, struct ra_fbo *out_fbo); + + // Present the frame. Issued in lockstep with start_frame, with rendering + // commands in between. The `frame` is just there for timing data, for + // swapchains smart enough to do something with it. + bool (*submit_frame)(struct ra_swapchain *sw, const struct vo_frame *frame); + + // Performs a buffer swap. This blocks for as long as necessary to meet + // params.swapchain_depth, or until the next vblank (for vsynced contexts) + void (*swap_buffers)(struct ra_swapchain *sw); +}; + +// Create and destroy a ra_ctx. This also takes care of creating and destroying +// the underlying `struct ra`, and perhaps the underlying VO backend. +struct ra_ctx *ra_ctx_create(struct vo *vo, const char *context_type, + const char *context_name, struct ra_ctx_opts opts); +void ra_ctx_destroy(struct ra_ctx **ctx); + +struct m_option; +int ra_ctx_validate_api(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param); +int ra_ctx_validate_context(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param); diff --git a/video/out/opengl/d3d11_helpers.c b/video/out/gpu/d3d11_helpers.c index d9b7fc2..b96b03a 100644 --- a/video/out/opengl/d3d11_helpers.c +++ b/video/out/gpu/d3d11_helpers.c @@ -46,6 +46,8 @@ static int get_feature_levels(int max_fl, int min_fl, const D3D_FEATURE_LEVEL **out) { static const D3D_FEATURE_LEVEL levels[] = { + D3D_FEATURE_LEVEL_12_1, + D3D_FEATURE_LEVEL_12_0, D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0, D3D_FEATURE_LEVEL_10_1, @@ -70,7 +72,7 @@ static int get_feature_levels(int max_fl, int min_fl, return len; } -static HRESULT create_device(struct mp_log *log, bool warp, bool bgra, +static HRESULT create_device(struct mp_log *log, bool warp, bool 
debug, int max_fl, int min_fl, ID3D11Device **dev) { const D3D_FEATURE_LEVEL *levels; @@ -82,7 +84,7 @@ static HRESULT create_device(struct mp_log *log, bool warp, bool bgra, D3D_DRIVER_TYPE type = warp ? D3D_DRIVER_TYPE_WARP : D3D_DRIVER_TYPE_HARDWARE; - UINT flags = bgra ? D3D11_CREATE_DEVICE_BGRA_SUPPORT : 0; + UINT flags = debug ? D3D11_CREATE_DEVICE_DEBUG : 0; return pD3D11CreateDevice(NULL, type, NULL, flags, levels, levels_len, D3D11_SDK_VERSION, dev, NULL, NULL); } @@ -95,7 +97,6 @@ bool mp_d3d11_create_present_device(struct mp_log *log, ID3D11Device **dev_out) { bool warp = opts->force_warp; - bool bgra = true; int max_fl = opts->max_feature_level; int min_fl = opts->min_feature_level; ID3D11Device *dev = NULL; @@ -116,25 +117,27 @@ bool mp_d3d11_create_present_device(struct mp_log *log, max_fl = max_fl ? max_fl : D3D_FEATURE_LEVEL_11_0; min_fl = min_fl ? min_fl : D3D_FEATURE_LEVEL_9_1; - hr = create_device(log, warp, bgra, max_fl, min_fl, &dev); + hr = create_device(log, warp, opts->debug, max_fl, min_fl, &dev); if (SUCCEEDED(hr)) break; - // BGRA is recommended, but FL 10_0 hardware may not support it - if (bgra) { - mp_dbg(log, "Failed to create D3D device with BGRA support\n"); - bgra = false; + // Trying to create a D3D_FEATURE_LEVEL_12_0 device on Windows 8.1 or + // below will not succeed. Try an 11_1 device. + if (max_fl >= D3D_FEATURE_LEVEL_12_0 && + min_fl <= D3D_FEATURE_LEVEL_11_1) + { + mp_dbg(log, "Failed to create 12_0+ device, trying 11_1\n"); + max_fl = D3D_FEATURE_LEVEL_11_1; continue; } // Trying to create a D3D_FEATURE_LEVEL_11_1 device on Windows 7 - // without the platform update will not succeed. Try a 11_0 device. + // without the platform update will not succeed. Try an 11_0 device. 
if (max_fl >= D3D_FEATURE_LEVEL_11_1 && min_fl <= D3D_FEATURE_LEVEL_11_0) { mp_dbg(log, "Failed to create 11_1+ device, trying 11_0\n"); max_fl = D3D_FEATURE_LEVEL_11_0; - bgra = true; continue; } @@ -144,7 +147,6 @@ bool mp_d3d11_create_present_device(struct mp_log *log, warp = true; max_fl = opts->max_feature_level; min_fl = opts->min_feature_level; - bgra = true; continue; } @@ -179,11 +181,13 @@ bool mp_d3d11_create_present_device(struct mp_log *log, (((unsigned)selected_level) >> 8) & 0xf); char *dev_name = mp_to_utf8(NULL, desc.Description); - mp_verbose(log, "Device: %s\n" - "VendorId: 0x%04d\n" - "DeviceId: 0x%04d\n" + mp_verbose(log, "Device Name: %s\n" + "Device ID: %04x:%04x (rev %02x)\n" + "Subsystem ID: %04x:%04x\n" "LUID: %08lx%08lx\n", - dev_name, desc.VendorId, desc.DeviceId, + dev_name, + desc.VendorId, desc.DeviceId, desc.Revision, + LOWORD(desc.SubSysId), HIWORD(desc.SubSysId), desc.AdapterLuid.HighPart, desc.AdapterLuid.LowPart); talloc_free(dev_name); @@ -381,3 +385,84 @@ done: SAFE_RELEASE(dxgi_dev); return success; } + +struct mp_image *mp_d3d11_screenshot(IDXGISwapChain *swapchain) +{ + ID3D11Device *dev = NULL; + ID3D11DeviceContext *ctx = NULL; + ID3D11Texture2D *frontbuffer = NULL; + ID3D11Texture2D *staging = NULL; + struct mp_image *img = NULL; + HRESULT hr; + + // Validate the swap chain. This screenshot method will only work on DXGI + // 1.2+ flip/sequential swap chains. It's probably not possible at all with + // discard swap chains, since by definition, the backbuffer contents is + // discarded on Present(). + DXGI_SWAP_CHAIN_DESC scd; + hr = IDXGISwapChain_GetDesc(swapchain, &scd); + if (FAILED(hr)) + goto done; + if (scd.SwapEffect != DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL) + goto done; + + // Get the last buffer that was presented with Present(). This should be + // the n-1th buffer for a swap chain of length n. 
+ hr = IDXGISwapChain_GetBuffer(swapchain, scd.BufferCount - 1, + &IID_ID3D11Texture2D, (void**)&frontbuffer); + if (FAILED(hr)) + goto done; + + ID3D11Texture2D_GetDevice(frontbuffer, &dev); + ID3D11Device_GetImmediateContext(dev, &ctx); + + D3D11_TEXTURE2D_DESC td; + ID3D11Texture2D_GetDesc(frontbuffer, &td); + if (td.SampleDesc.Count > 1) + goto done; + + // Validate the backbuffer format and convert to an mpv IMGFMT + enum mp_imgfmt fmt; + switch (td.Format) { + case DXGI_FORMAT_B8G8R8A8_UNORM: fmt = IMGFMT_BGR0; break; + case DXGI_FORMAT_R8G8B8A8_UNORM: fmt = IMGFMT_RGB0; break; + default: + goto done; + } + + // Create a staging texture based on the frontbuffer with CPU access + td.BindFlags = 0; + td.MiscFlags = 0; + td.CPUAccessFlags = D3D11_CPU_ACCESS_READ; + td.Usage = D3D11_USAGE_STAGING; + hr = ID3D11Device_CreateTexture2D(dev, &td, 0, &staging); + if (FAILED(hr)) + goto done; + + ID3D11DeviceContext_CopyResource(ctx, (ID3D11Resource*)staging, + (ID3D11Resource*)frontbuffer); + + // Attempt to map the staging texture to CPU-accessible memory + D3D11_MAPPED_SUBRESOURCE lock; + hr = ID3D11DeviceContext_Map(ctx, (ID3D11Resource*)staging, 0, + D3D11_MAP_READ, 0, &lock); + if (FAILED(hr)) + goto done; + + img = mp_image_alloc(fmt, td.Width, td.Height); + if (!img) + return NULL; + for (int i = 0; i < td.Height; i++) { + memcpy(img->planes[0] + img->stride[0] * i, + (char*)lock.pData + lock.RowPitch * i, td.Width * 4); + } + + ID3D11DeviceContext_Unmap(ctx, (ID3D11Resource*)staging, 0); + +done: + SAFE_RELEASE(frontbuffer); + SAFE_RELEASE(staging); + SAFE_RELEASE(ctx); + SAFE_RELEASE(dev); + return img; +} diff --git a/video/out/opengl/d3d11_helpers.h b/video/out/gpu/d3d11_helpers.h index f34d1d4..481c183 100644 --- a/video/out/opengl/d3d11_helpers.h +++ b/video/out/gpu/d3d11_helpers.h @@ -23,7 +23,15 @@ #include <d3d11.h> #include <dxgi1_2.h> +#include "video/mp_image.h" + +#define D3D_FEATURE_LEVEL_12_0 (0xc000) +#define D3D_FEATURE_LEVEL_12_1 (0xc100) + 
struct d3d11_device_opts { + // Enable the debug layer (D3D11_CREATE_DEVICE_DEBUG) + bool debug; + // Allow a software (WARP) adapter. Note, sometimes a software adapter will // be used even when allow_warp is false. This is because, on Windows 8 and // up, if there are no hardware adapters, Windows will pretend the WARP @@ -70,4 +78,6 @@ bool mp_d3d11_create_swapchain(ID3D11Device *dev, struct mp_log *log, struct d3d11_swapchain_opts *opts, IDXGISwapChain **swapchain_out); +struct mp_image *mp_d3d11_screenshot(IDXGISwapChain *swapchain); + #endif diff --git a/video/out/opengl/hwdec.c b/video/out/gpu/hwdec.c index 5fbc1aa..5284116 100644 --- a/video/out/opengl/hwdec.c +++ b/video/out/gpu/hwdec.c @@ -34,19 +34,16 @@ extern const struct ra_hwdec_driver ra_hwdec_d3d11egl; extern const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb; extern const struct ra_hwdec_driver ra_hwdec_dxva2gldx; extern const struct ra_hwdec_driver ra_hwdec_dxva2; +extern const struct ra_hwdec_driver ra_hwdec_d3d11va; extern const struct ra_hwdec_driver ra_hwdec_cuda; +extern const struct ra_hwdec_driver ra_hwdec_cuda_nvdec; extern const struct ra_hwdec_driver ra_hwdec_rpi_overlay; +extern const struct ra_hwdec_driver ra_hwdec_drmprime_drm; -static const struct ra_hwdec_driver *const mpgl_hwdec_drivers[] = { +const struct ra_hwdec_driver *const ra_hwdec_drivers[] = { #if HAVE_VAAPI_EGL &ra_hwdec_vaegl, #endif -#if HAVE_VAAPI_GLX - &ra_hwdec_vaglx, -#endif -#if HAVE_VDPAU_GL_X11 - &ra_hwdec_vdpau, -#endif #if HAVE_VIDEOTOOLBOX_GL || HAVE_IOS_GL &ra_hwdec_videotoolbox, #endif @@ -56,6 +53,9 @@ static const struct ra_hwdec_driver *const mpgl_hwdec_drivers[] = { #if HAVE_D3D9_HWACCEL &ra_hwdec_dxva2egl, #endif + #if HAVE_D3D11 + &ra_hwdec_d3d11va, + #endif #endif #if HAVE_GL_DXINTEROP_D3D9 &ra_hwdec_dxva2gldx, @@ -63,17 +63,24 @@ static const struct ra_hwdec_driver *const mpgl_hwdec_drivers[] = { #if HAVE_CUDA_HWACCEL &ra_hwdec_cuda, #endif +#if HAVE_VDPAU_GL_X11 + &ra_hwdec_vdpau, +#endif #if 
HAVE_RPI &ra_hwdec_rpi_overlay, #endif +#if HAVE_DRMPRIME && HAVE_DRM + &ra_hwdec_drmprime_drm, +#endif + NULL }; -static struct ra_hwdec *load_hwdec_driver(struct mp_log *log, struct ra *ra, - struct mpv_global *global, - struct mp_hwdec_devices *devs, - const struct ra_hwdec_driver *drv, - bool is_auto) +struct ra_hwdec *ra_hwdec_load_driver(struct ra *ra, struct mp_log *log, + struct mpv_global *global, + struct mp_hwdec_devices *devs, + const struct ra_hwdec_driver *drv, + bool is_auto) { struct ra_hwdec *hwdec = talloc(NULL, struct ra_hwdec); *hwdec = (struct ra_hwdec) { @@ -94,81 +101,31 @@ static struct ra_hwdec *load_hwdec_driver(struct mp_log *log, struct ra *ra, return hwdec; } -struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - enum hwdec_type api) -{ - bool is_auto = HWDEC_IS_AUTO(api); - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - if ((is_auto || api == drv->api) && !drv->testing_only) { - struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, is_auto); - if (r) - return r; - } - } - return NULL; -} - -// Load by option name. 
-struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - const char *name) -{ - int g_hwdec_api; - mp_read_option_raw(g, "hwdec", &m_option_type_choice, &g_hwdec_api); - if (!name || !name[0]) - name = m_opt_choice_str(mp_hwdec_names, g_hwdec_api); - - int api_id = HWDEC_NONE; - for (int n = 0; mp_hwdec_names[n].name; n++) { - if (name && strcmp(mp_hwdec_names[n].name, name) == 0) - api_id = mp_hwdec_names[n].value; - } - - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - if (name && strcmp(drv->name, name) == 0) { - struct ra_hwdec *r = load_hwdec_driver(log, ra, g, devs, drv, false); - if (r) - return r; - } - } - - return ra_hwdec_load_api(log, ra, g, devs, api_id); -} - int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, struct bstr name, struct bstr param) { bool help = bstr_equals0(param, "help"); if (help) mp_info(log, "Available hwdecs:\n"); - for (int n = 0; mpgl_hwdec_drivers[n]; n++) { - const struct ra_hwdec_driver *drv = mpgl_hwdec_drivers[n]; - const char *api_name = m_opt_choice_str(mp_hwdec_names, drv->api); + for (int n = 0; ra_hwdec_drivers[n]; n++) { + const struct ra_hwdec_driver *drv = ra_hwdec_drivers[n]; if (help) { - mp_info(log, " %s [%s]\n", drv->name, api_name); - } else if (bstr_equals0(param, drv->name) || - bstr_equals0(param, api_name)) - { + mp_info(log, " %s\n", drv->name); + } else if (bstr_equals0(param, drv->name)) { return 1; } } if (help) { - mp_info(log, " auto (loads best)\n" - " (other --hwdec values)\n" - "Setting an empty string means use --hwdec.\n"); + mp_info(log, " auto (behavior depends on context)\n" + " all (load all hwdecs)\n" + " no (do not load any and block loading on demand)\n"); return M_OPT_EXIT; } if (!param.len) return 1; // "" is treated specially - for (int n = 0; mp_hwdec_names[n].name; n++) { - if (bstr_equals0(param, mp_hwdec_names[n].name)) - return 1; - } + 
if (bstr_equals0(param, "all") || bstr_equals0(param, "auto") || + bstr_equals0(param, "no")) + return 1; mp_fatal(log, "No hwdec backend named '%.*s' found!\n", BSTR_P(param)); return M_OPT_INVALID; } diff --git a/video/out/opengl/hwdec.h b/video/out/gpu/hwdec.h index 20bbaae..258ab88 100644 --- a/video/out/opengl/hwdec.h +++ b/video/out/gpu/hwdec.h @@ -72,17 +72,14 @@ struct ra_hwdec_mapper_driver { }; struct ra_hwdec_driver { - // Name of the interop backend. This is used for informational purposes only. + // Name of the interop backend. This is used for informational purposes and + // for use with debugging options. const char *name; // Used to create ra_hwdec.priv. size_t priv_size; - // Used to explicitly request a specific API. - enum hwdec_type api; // One of the hardware surface IMGFMT_ that must be passed to map_image later. // Terminated with a 0 entry. (Extend the array size as needed.) const int imgfmts[3]; - // Dosn't load this unless requested by name. - bool testing_only; // Create the hwdec device. It must add it to hw->devs, if applicable. 
int (*init)(struct ra_hwdec *hw); @@ -104,15 +101,13 @@ struct ra_hwdec_driver { struct mp_rect *src, struct mp_rect *dst, bool newframe); }; -struct ra_hwdec *ra_hwdec_load_api(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - enum hwdec_type api); +extern const struct ra_hwdec_driver *const ra_hwdec_drivers[]; -struct ra_hwdec *ra_hwdec_load(struct mp_log *log, struct ra *ra, - struct mpv_global *g, - struct mp_hwdec_devices *devs, - const char *name); +struct ra_hwdec *ra_hwdec_load_driver(struct ra *ra, struct mp_log *log, + struct mpv_global *global, + struct mp_hwdec_devices *devs, + const struct ra_hwdec_driver *drv, + bool is_auto); int ra_hwdec_validate_opt(struct mp_log *log, const m_option_t *opt, struct bstr name, struct bstr param); diff --git a/video/out/opengl/lcms.c b/video/out/gpu/lcms.c index 8747ae6..3552351 100644 --- a/video/out/opengl/lcms.c +++ b/video/out/gpu/lcms.c @@ -236,7 +236,7 @@ static cmsHPROFILE get_vid_profile(struct gl_lcms *p, cmsContext cms, } // Otherwise, warn the user and generate the profile as usual - MP_WARN(p, "Video contained an invalid ICC profile! Ignoring..\n"); + MP_WARN(p, "Video contained an invalid ICC profile! 
Ignoring...\n"); } // The input profile for the transformation is dependent on the video diff --git a/video/out/opengl/lcms.h b/video/out/gpu/lcms.h index 35bbd61..35bbd61 100644 --- a/video/out/opengl/lcms.h +++ b/video/out/gpu/lcms.h diff --git a/video/out/opengl/osd.c b/video/out/gpu/osd.c index f7c325d..317deb6 100644 --- a/video/out/opengl/osd.c +++ b/video/out/gpu/osd.c @@ -47,7 +47,6 @@ static const struct ra_renderpass_input vertex_vao[] = { {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, {"texcoord" , RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord)}, {"ass_color", RA_VARTYPE_BYTE_UNORM, 4, 1, offsetof(struct vertex, ass_color)}, - {0} }; struct mpgl_osd_part { @@ -231,8 +230,6 @@ bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, abort(); } - gl_sc_set_vertex_format(sc, vertex_vao, sizeof(struct vertex)); - return true; } @@ -256,8 +253,8 @@ static void write_quad(struct vertex *va, struct gl_transform t, static void generate_verts(struct mpgl_osd_part *part, struct gl_transform t) { - int num_vertices = part->num_subparts * 6; - MP_TARRAY_GROW(part, part->vertices, part->num_vertices + num_vertices); + MP_TARRAY_GROW(part, part->vertices, + part->num_vertices + part->num_subparts * 6); for (int n = 0; n < part->num_subparts; n++) { struct sub_bitmap *b = &part->subparts[n]; @@ -269,13 +266,13 @@ static void generate_verts(struct mpgl_osd_part *part, struct gl_transform t) uint8_t color[4] = { c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, 255 - (c & 0xff) }; - write_quad(&va[n * 6], t, + write_quad(va, t, b->x, b->y, b->x + b->dw, b->y + b->dh, b->src_x, b->src_y, b->src_x + b->w, b->src_y + b->h, part->w, part->h, color); - } - part->num_vertices += num_vertices; + part->num_vertices += 6; + } } // number of screen divisions per axis (x=0, y=1) for the current 3D mode @@ -291,7 +288,7 @@ static void get_3d_side_by_side(int stereo_mode, int div[2]) } void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, - 
struct gl_shader_cache *sc, struct fbodst target) + struct gl_shader_cache *sc, struct ra_fbo fbo) { struct mpgl_osd_part *part = ctx->parts[index]; @@ -303,7 +300,7 @@ void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, for (int x = 0; x < div[0]; x++) { for (int y = 0; y < div[1]; y++) { struct gl_transform t; - gl_transform_ortho_fbodst(&t, target); + gl_transform_ortho_fbo(&t, fbo); float a_x = ctx->osd_res.w * x; float a_y = ctx->osd_res.h * y; @@ -317,7 +314,8 @@ void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, const int *factors = &blend_factors[part->format][0]; gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]); - gl_sc_dispatch_draw(sc, target.tex, part->vertices, part->num_vertices); + gl_sc_dispatch_draw(sc, fbo.tex, vertex_vao, MP_ARRAY_SIZE(vertex_vao), + sizeof(struct vertex), part->vertices, part->num_vertices); } static void set_res(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mode) diff --git a/video/out/opengl/osd.h b/video/out/gpu/osd.h index 6c2b886..00fbc49 100644 --- a/video/out/opengl/osd.h +++ b/video/out/gpu/osd.h @@ -18,7 +18,7 @@ void mpgl_osd_resize(struct mpgl_osd *ctx, struct mp_osd_res res, int stereo_mod bool mpgl_osd_draw_prepare(struct mpgl_osd *ctx, int index, struct gl_shader_cache *sc); void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index, - struct gl_shader_cache *sc, struct fbodst target); + struct gl_shader_cache *sc, struct ra_fbo fbo); bool mpgl_osd_check_change(struct mpgl_osd *ctx, struct mp_osd_res *res, double pts); diff --git a/video/out/opengl/ra.c b/video/out/gpu/ra.c index 208507d..fdb20fe 100644 --- a/video/out/opengl/ra.c +++ b/video/out/gpu/ra.c @@ -71,7 +71,7 @@ static struct ra_renderpass_input *dup_inputs(void *ta_parent, } // Return a newly allocated deep-copy of params. 
-struct ra_renderpass_params *ra_render_pass_params_copy(void *ta_parent, +struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, const struct ra_renderpass_params *params) { struct ra_renderpass_params *res = talloc_ptrtype(ta_parent, res); @@ -86,6 +86,65 @@ struct ra_renderpass_params *ra_render_pass_params_copy(void *ta_parent, return res; }; +struct glsl_fmt { + enum ra_ctype ctype; + int num_components; + int component_depth[4]; + const char *glsl_format; +}; + +// List taken from the GLSL specification, sans snorm and sint formats +static const struct glsl_fmt ra_glsl_fmts[] = { + {RA_CTYPE_FLOAT, 1, {16}, "r16f"}, + {RA_CTYPE_FLOAT, 1, {32}, "r32f"}, + {RA_CTYPE_FLOAT, 2, {16, 16}, "rg16f"}, + {RA_CTYPE_FLOAT, 2, {32, 32}, "rg32f"}, + {RA_CTYPE_FLOAT, 4, {16, 16, 16, 16}, "rgba16f"}, + {RA_CTYPE_FLOAT, 4, {32, 32, 32, 32}, "rgba32f"}, + {RA_CTYPE_FLOAT, 3, {11, 11, 10}, "r11f_g11f_b10f"}, + + {RA_CTYPE_UNORM, 1, {8}, "r8"}, + {RA_CTYPE_UNORM, 1, {16}, "r16"}, + {RA_CTYPE_UNORM, 2, {8, 8}, "rg8"}, + {RA_CTYPE_UNORM, 2, {16, 16}, "rg16"}, + {RA_CTYPE_UNORM, 4, {8, 8, 8, 8}, "rgba8"}, + {RA_CTYPE_UNORM, 4, {16, 16, 16, 16}, "rgba16"}, + {RA_CTYPE_UNORM, 4, {10, 10, 10, 2}, "rgb10_a2"}, + + {RA_CTYPE_UINT, 1, {8}, "r8ui"}, + {RA_CTYPE_UINT, 1, {16}, "r16ui"}, + {RA_CTYPE_UINT, 1, {32}, "r32ui"}, + {RA_CTYPE_UINT, 2, {8, 8}, "rg8ui"}, + {RA_CTYPE_UINT, 2, {16, 16}, "rg16ui"}, + {RA_CTYPE_UINT, 2, {32, 32}, "rg32ui"}, + {RA_CTYPE_UINT, 4, {8, 8, 8, 8}, "rgba8ui"}, + {RA_CTYPE_UINT, 4, {16, 16, 16, 16}, "rgba16ui"}, + {RA_CTYPE_UINT, 4, {32, 32, 32, 32}, "rgba32ui"}, + {RA_CTYPE_UINT, 4, {10, 10, 10, 2}, "rgb10_a2ui"}, +}; + +const char *ra_fmt_glsl_format(const struct ra_format *fmt) +{ + for (int n = 0; n < MP_ARRAY_SIZE(ra_glsl_fmts); n++) { + const struct glsl_fmt *gfmt = &ra_glsl_fmts[n]; + + if (fmt->ctype != gfmt->ctype) + continue; + if (fmt->num_components != gfmt->num_components) + continue; + + for (int i = 0; i < fmt->num_components; 
i++) { + if (fmt->component_depth[i] != gfmt->component_depth[i]) + goto next_fmt; + } + + return gfmt->glsl_format; + +next_fmt: ; // equivalent to `continue` + } + + return NULL; +} // Return whether this is a tightly packed format with no external padding and // with the same bit size/depth in all components, and the shader returns diff --git a/video/out/opengl/ra.h b/video/out/gpu/ra.h index 46a69f2..934e5db 100644 --- a/video/out/opengl/ra.h +++ b/video/out/gpu/ra.h @@ -26,6 +26,9 @@ struct ra { // time. size_t max_shmem; + // Maximum push constant size. Set by the RA backend at init time. + size_t max_pushc_size; + // Set of supported texture formats. Must be added by RA backend at init time. // If there are equivalent formats with different caveats, the preferred // formats should have a lower index. (E.g. GLES3 should put rg8 before la.) @@ -47,8 +50,9 @@ enum { RA_CAP_BUF_RO = 1 << 5, // supports RA_VARTYPE_BUF_RO RA_CAP_BUF_RW = 1 << 6, // supports RA_VARTYPE_BUF_RW RA_CAP_NESTED_ARRAY = 1 << 7, // supports nested arrays - RA_CAP_SHARED_BINDING = 1 << 8, // sampler/image/buffer namespaces are disjoint - RA_CAP_GLOBAL_UNIFORM = 1 << 9, // supports using "naked" uniforms (not UBO) + RA_CAP_GLOBAL_UNIFORM = 1 << 8, // supports using "naked" uniforms (not UBO) + RA_CAP_GATHER = 1 << 9, // supports textureGather in GLSL + RA_CAP_FRAGCOORD = 1 << 10, // supports reading from gl_FragCoord }; enum ra_ctype { @@ -85,6 +89,10 @@ struct ra_format { // shader representation is given by the special_imgfmt_desc pointer. int special_imgfmt; const struct ra_imgfmt_desc *special_imgfmt_desc; + + // This gives the GLSL image format corresponding to the format, if any. + // (e.g. rgba16ui) + const char *glsl_format; }; struct ra_tex_params { @@ -139,13 +147,14 @@ struct ra_tex_upload_params { ptrdiff_t stride; // The size of a horizontal line in bytes (*not* texels!) }; -// Buffer type hint. 
Setting this may result in more or less efficient -// operation, although it shouldn't technically prohibit anything +// Buffer usage type. This restricts what types of operations may be performed +// on a buffer. enum ra_buf_type { RA_BUF_TYPE_INVALID, RA_BUF_TYPE_TEX_UPLOAD, // texture upload buffer (pixel buffer object) RA_BUF_TYPE_SHADER_STORAGE, // shader buffer (SSBO), for RA_VARTYPE_BUF_RW RA_BUF_TYPE_UNIFORM, // uniform buffer (UBO), for RA_VARTYPE_BUF_RO + RA_BUF_TYPE_VERTEX, // not publicly usable (RA-internal usage) }; struct ra_buf_params { @@ -202,8 +211,8 @@ struct ra_renderpass_input { // RA_VARTYPE_IMG_W: image unit // RA_VARTYPE_BUF_* buffer binding point // Other uniforms: unused - // If RA_CAP_SHARED_BINDING is set, these may only be unique per input type. - // Otherwise, these must be unique for all input values. + // Bindings must be unique within each namespace, as specified by + // desc_namespace() int binding; }; @@ -244,6 +253,7 @@ struct ra_renderpass_params { // Uniforms, including texture/sampler inputs. struct ra_renderpass_input *inputs; int num_inputs; + size_t push_constants_size; // must be <= ra.max_pushc_size and a multiple of 4 // Highly implementation-specific byte array storing a compiled version // of the program. Can be used to speed up shader compilation. A backend @@ -281,7 +291,7 @@ struct ra_renderpass_params { const char *compute_shader; }; -struct ra_renderpass_params *ra_render_pass_params_copy(void *ta_parent, +struct ra_renderpass_params *ra_renderpass_params_copy(void *ta_parent, const struct ra_renderpass_params *params); // Conflates the following typical GPU API concepts: @@ -316,6 +326,7 @@ struct ra_renderpass_run_params { // even if they do not change. 
struct ra_renderpass_input_val *values; int num_values; + void *push_constants; // must be set if params.push_constants_size > 0 // --- pass->params.type==RA_RENDERPASS_TYPE_RASTER only @@ -369,10 +380,10 @@ struct ra_fns { void (*buf_destroy)(struct ra *ra, struct ra_buf *buf); - // Update the contents of a buffer, starting at a given offset and up to a - // given size, with the contents of *data. This is an extremely common - // operation. Calling this while the buffer is considered "in use" is an - // error. (See: buf_poll) + // Update the contents of a buffer, starting at a given offset (*must* be a + // multiple of 4) and up to a given size, with the contents of *data. This + // is an extremely common operation. Calling this while the buffer is + // considered "in use" is an error. (See: buf_poll) void (*buf_update)(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, const void *data, size_t size); @@ -386,6 +397,15 @@ struct ra_fns { // but must be implemented if RA_CAP_BUF_RO is supported. struct ra_layout (*uniform_layout)(struct ra_renderpass_input *inp); + // Returns the layout requirements of a push constant element. Optional, + // but must be implemented if ra.max_pushc_size > 0. + struct ra_layout (*push_constant_layout)(struct ra_renderpass_input *inp); + + // Returns an abstract namespace index for a given renderpass input type. + // This will always be a value >= 0 and < RA_VARTYPE_COUNT. This is used to + // figure out which inputs may share the same value of `binding`. + int (*desc_namespace)(enum ra_vartype type); + // Clear the dst with the given color (rgba) and within the given scissor. // dst must have dst->params.render_dst==true. Content outside of the // scissor is preserved. @@ -436,9 +456,6 @@ struct ra_fns { // delayed by a few frames. When no value is available, this returns 0. uint64_t (*timer_stop)(struct ra *ra, ra_timer *timer); - // Hint that possibly queued up commands should be sent to the GPU. Optional. 
- void (*flush)(struct ra *ra); - // Associates a marker with any past error messages, for debugging // purposes. Optional. void (*debug_marker)(struct ra *ra, const char *msg); @@ -483,6 +500,8 @@ struct ra_imgfmt_desc { uint8_t components[4][4]; }; +const char *ra_fmt_glsl_format(const struct ra_format *fmt); + bool ra_get_imgfmt_desc(struct ra *ra, int imgfmt, struct ra_imgfmt_desc *out); void ra_dump_tex_formats(struct ra *ra, int msgl); diff --git a/video/out/opengl/shader_cache.c b/video/out/gpu/shader_cache.c index 90a7576..6d0f370 100644 --- a/video/out/opengl/shader_cache.c +++ b/video/out/gpu/shader_cache.c @@ -14,7 +14,6 @@ #include "options/path.h" #include "stream/stream.h" #include "shader_cache.h" -#include "formats.h" #include "utils.h" // Force cache flush if more than this number of shaders is created. @@ -30,6 +29,7 @@ union uniform_val { enum sc_uniform_type { SC_UNIFORM_TYPE_GLOBAL = 0, // global uniform (RA_CAP_GLOBAL_UNIFORM) SC_UNIFORM_TYPE_UBO = 1, // uniform buffer (RA_CAP_BUF_RO) + SC_UNIFORM_TYPE_PUSHC = 2, // push constant (ra.max_pushc_size) }; struct sc_uniform { @@ -38,7 +38,7 @@ struct sc_uniform { const char *glsl_type; union uniform_val v; char *buffer_format; - // for SC_UNIFORM_TYPE_UBO: + // for SC_UNIFORM_TYPE_UBO/PUSHC: struct ra_layout layout; size_t offset; // byte offset within the buffer }; @@ -57,6 +57,7 @@ struct sc_entry { struct timer_pool *timer; struct ra_buf *ubo; int ubo_index; // for ra_renderpass_input_val.index + void *pushc; }; struct gl_shader_cache { @@ -75,6 +76,7 @@ struct gl_shader_cache { // Next binding point (texture unit, image unit, buffer binding, etc.) 
// In OpenGL these are separate for each input type int next_binding[RA_VARTYPE_COUNT]; + bool next_uniform_dynamic; struct ra_renderpass_params params; @@ -88,6 +90,7 @@ struct gl_shader_cache { int ubo_binding; size_t ubo_size; + size_t pushc_size; struct ra_renderpass_input_val *values; int num_values; @@ -105,8 +108,6 @@ struct gl_shader_cache { struct mpv_global *global; // can be NULL }; -static void gl_sc_reset(struct gl_shader_cache *sc); - struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, struct mp_log *log) { @@ -121,8 +122,8 @@ struct gl_shader_cache *gl_sc_create(struct ra *ra, struct mpv_global *global, } // Reset the previous pass. This must be called after gl_sc_generate and before -// starting a new shader. -static void gl_sc_reset(struct gl_shader_cache *sc) +// starting a new shader. It may also be called on errors. +void gl_sc_reset(struct gl_shader_cache *sc) { sc->prelude_text.len = 0; sc->header_text.len = 0; @@ -132,8 +133,10 @@ static void gl_sc_reset(struct gl_shader_cache *sc) sc->num_uniforms = 0; sc->ubo_binding = 0; sc->ubo_size = 0; + sc->pushc_size = 0; for (int i = 0; i < RA_VARTYPE_COUNT; i++) sc->next_binding[i] = 0; + sc->next_uniform_dynamic = false; sc->current_shader = NULL; sc->params = (struct ra_renderpass_params){0}; sc->needs_reset = false; @@ -141,7 +144,7 @@ static void gl_sc_reset(struct gl_shader_cache *sc) static void sc_flush_cache(struct gl_shader_cache *sc) { - MP_VERBOSE(sc, "flushing shader cache\n"); + MP_DBG(sc, "flushing shader cache\n"); for (int n = 0; n < sc->num_entries; n++) { struct sc_entry *e = sc->entries[n]; @@ -251,32 +254,59 @@ static struct sc_uniform *find_uniform(struct gl_shader_cache *sc, static int gl_sc_next_binding(struct gl_shader_cache *sc, enum ra_vartype type) { - if (sc->ra->caps & RA_CAP_SHARED_BINDING) { - return sc->next_binding[type]++; - } else { - return sc->next_binding[0]++; - } + return sc->next_binding[sc->ra->fns->desc_namespace(type)]++; } -// 
Updates the UBO metadata for the given sc_uniform. Assumes sc_uniform->input -// is already set. Also updates sc_uniform->type. -static void update_ubo_params(struct gl_shader_cache *sc, struct sc_uniform *u) +void gl_sc_uniform_dynamic(struct gl_shader_cache *sc) { - if (!(sc->ra->caps & RA_CAP_BUF_RO)) - return; + sc->next_uniform_dynamic = true; +} + +// Updates the metadata for the given sc_uniform. Assumes sc_uniform->input +// and glsl_type/buffer_format are already set. +static void update_uniform_params(struct gl_shader_cache *sc, struct sc_uniform *u) +{ + bool dynamic = sc->next_uniform_dynamic; + sc->next_uniform_dynamic = false; + + // Try not using push constants for "large" values like matrices, since + // this is likely to both exceed the VGPR budget as well as the pushc size + // budget + bool try_pushc = u->input.dim_m == 1 || dynamic; + + // Attempt using push constants first + if (try_pushc && sc->ra->glsl_vulkan && sc->ra->max_pushc_size) { + struct ra_layout layout = sc->ra->fns->push_constant_layout(&u->input); + size_t offset = MP_ALIGN_UP(sc->pushc_size, layout.align); + // Push constants have limited size, so make sure we don't exceed this + size_t new_size = offset + layout.size; + if (new_size <= sc->ra->max_pushc_size) { + u->type = SC_UNIFORM_TYPE_PUSHC; + u->layout = layout; + u->offset = offset; + sc->pushc_size = new_size; + return; + } + } - // Using UBOs with explicit layout(offset) like we do requires GLSL version - // 440 or higher. In theory the UBO code can also use older versions, but - // just try and avoid potential headaches. This also ensures they're only - // used on drivers that are probably modern enough to actually support them - // correctly. - if (sc->ra->glsl_version < 440) + // Attempt using uniform buffer next. The GLSL version 440 check is due + // to explicit offsets on UBO entries. 
In theory we could leave away + // the offsets and support UBOs for older GL as well, but this is a nice + // safety net for driver bugs (and also rules out potentially buggy drivers) + // Also avoid UBOs for highly dynamic stuff since that requires synchronizing + // the UBO writes every frame + bool try_ubo = !(sc->ra->caps & RA_CAP_GLOBAL_UNIFORM) || !dynamic; + if (try_ubo && sc->ra->glsl_version >= 440 && (sc->ra->caps & RA_CAP_BUF_RO)) { + u->type = SC_UNIFORM_TYPE_UBO; + u->layout = sc->ra->fns->uniform_layout(&u->input); + u->offset = MP_ALIGN_UP(sc->ubo_size, u->layout.align); + sc->ubo_size = u->offset + u->layout.size; return; + } - u->type = SC_UNIFORM_TYPE_UBO; - u->layout = sc->ra->fns->uniform_layout(&u->input); - u->offset = MP_ALIGN_UP(sc->ubo_size, u->layout.align); - sc->ubo_size = u->offset + u->layout.size; + // If all else fails, use global uniforms + assert(sc->ra->caps & RA_CAP_GLOBAL_UNIFORM); + u->type = SC_UNIFORM_TYPE_GLOBAL; } void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, @@ -337,7 +367,7 @@ void gl_sc_uniform_f(struct gl_shader_cache *sc, char *name, float f) struct sc_uniform *u = find_uniform(sc, name); u->input.type = RA_VARTYPE_FLOAT; u->glsl_type = "float"; - update_ubo_params(sc, u); + update_uniform_params(sc, u); u->v.f[0] = f; } @@ -346,7 +376,7 @@ void gl_sc_uniform_i(struct gl_shader_cache *sc, char *name, int i) struct sc_uniform *u = find_uniform(sc, name); u->input.type = RA_VARTYPE_INT; u->glsl_type = "int"; - update_ubo_params(sc, u); + update_uniform_params(sc, u); u->v.i[0] = i; } @@ -356,18 +386,18 @@ void gl_sc_uniform_vec2(struct gl_shader_cache *sc, char *name, float f[2]) u->input.type = RA_VARTYPE_FLOAT; u->input.dim_v = 2; u->glsl_type = "vec2"; - update_ubo_params(sc, u); + update_uniform_params(sc, u); u->v.f[0] = f[0]; u->v.f[1] = f[1]; } -void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, GLfloat f[3]) +void gl_sc_uniform_vec3(struct gl_shader_cache *sc, char *name, float 
f[3]) { struct sc_uniform *u = find_uniform(sc, name); u->input.type = RA_VARTYPE_FLOAT; u->input.dim_v = 3; u->glsl_type = "vec3"; - update_ubo_params(sc, u); + update_uniform_params(sc, u); u->v.f[0] = f[0]; u->v.f[1] = f[1]; u->v.f[2] = f[2]; @@ -379,14 +409,14 @@ static void transpose2x2(float r[2 * 2]) } void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, - bool transpose, GLfloat *v) + bool transpose, float *v) { struct sc_uniform *u = find_uniform(sc, name); u->input.type = RA_VARTYPE_FLOAT; u->input.dim_v = 2; u->input.dim_m = 2; u->glsl_type = "mat2"; - update_ubo_params(sc, u); + update_uniform_params(sc, u); for (int n = 0; n < 4; n++) u->v.f[n] = v[n]; if (transpose) @@ -401,34 +431,20 @@ static void transpose3x3(float r[3 * 3]) } void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, - bool transpose, GLfloat *v) + bool transpose, float *v) { struct sc_uniform *u = find_uniform(sc, name); u->input.type = RA_VARTYPE_FLOAT; u->input.dim_v = 3; u->input.dim_m = 3; u->glsl_type = "mat3"; - update_ubo_params(sc, u); + update_uniform_params(sc, u); for (int n = 0; n < 9; n++) u->v.f[n] = v[n]; if (transpose) transpose3x3(&u->v.f[0]); } -// Tell the shader generator (and later gl_sc_draw_data()) about the vertex -// data layout and attribute names. The entries array is terminated with a {0} -// entry. The array memory must remain valid indefinitely (for now). 
-void gl_sc_set_vertex_format(struct gl_shader_cache *sc, - const struct ra_renderpass_input *entries, - int vertex_stride) -{ - sc->params.vertex_attribs = (struct ra_renderpass_input *)entries; - sc->params.num_vertex_attribs = 0; - while (entries[sc->params.num_vertex_attribs].name) - sc->params.num_vertex_attribs++; - sc->params.vertex_stride = vertex_stride; -} - void gl_sc_blend(struct gl_shader_cache *sc, enum ra_blend blend_src_rgb, enum ra_blend blend_dst_rgb, @@ -468,6 +484,20 @@ static void update_ubo(struct ra *ra, struct ra_buf *ubo, struct sc_uniform *u) } } +static void update_pushc(struct ra *ra, void *pushc, struct sc_uniform *u) +{ + uintptr_t src = (uintptr_t) &u->v; + uintptr_t dst = (uintptr_t) pushc + (ptrdiff_t) u->offset; + struct ra_layout src_layout = ra_renderpass_input_layout(&u->input); + struct ra_layout dst_layout = u->layout; + + for (int i = 0; i < u->input.dim_m; i++) { + memcpy((void *)dst, (void *)src, src_layout.stride); + src += src_layout.stride; + dst += dst_layout.stride; + } +} + static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e, struct sc_uniform *u, int n) { @@ -479,6 +509,13 @@ static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e, un->v = u->v; un->set = true; + static const char *desc[] = { + [SC_UNIFORM_TYPE_UBO] = "UBO", + [SC_UNIFORM_TYPE_PUSHC] = "PC", + [SC_UNIFORM_TYPE_GLOBAL] = "global", + }; + MP_TRACE(sc, "Updating %s uniform '%s'\n", desc[u->type], u->input.name); + switch (u->type) { case SC_UNIFORM_TYPE_GLOBAL: { struct ra_renderpass_input_val value = { @@ -492,6 +529,10 @@ static void update_uniform(struct gl_shader_cache *sc, struct sc_entry *e, assert(e->ubo); update_ubo(sc->ra, e->ubo, u); break; + case SC_UNIFORM_TYPE_PUSHC: + assert(e->pushc); + update_pushc(sc->ra, e->pushc, u); + break; default: abort(); } } @@ -509,25 +550,6 @@ static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) void *tmp = talloc_new(NULL); struct 
ra_renderpass_params params = sc->params; - MP_VERBOSE(sc, "new shader program:\n"); - if (sc->header_text.len) { - MP_VERBOSE(sc, "header:\n"); - mp_log_source(sc->log, MSGL_V, sc->header_text.start); - MP_VERBOSE(sc, "body:\n"); - } - if (sc->text.len) - mp_log_source(sc->log, MSGL_V, sc->text.start); - - // The vertex shader uses mangled names for the vertex attributes, so that - // the fragment shader can use the "real" names. But the shader is expecting - // the vertex attribute names (at least with older GLSL targets for GL). - params.vertex_attribs = talloc_memdup(tmp, params.vertex_attribs, - params.num_vertex_attribs * sizeof(params.vertex_attribs[0])); - for (int n = 0; n < params.num_vertex_attribs; n++) { - struct ra_renderpass_input *attrib = ¶ms.vertex_attribs[n]; - attrib->name = talloc_asprintf(tmp, "vertex_%s", attrib->name); - } - const char *cache_header = "mpv shader cache v1\n"; char *cache_filename = NULL; char *cache_dir = NULL; @@ -552,7 +574,7 @@ static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) cache_filename = mp_path_join(tmp, cache_dir, hashstr); if (stat(cache_filename, &(struct stat){0}) == 0) { - MP_VERBOSE(sc, "Trying to load shader from disk...\n"); + MP_DBG(sc, "Trying to load shader from disk...\n"); struct bstr cachedata = stream_read_file(cache_filename, tmp, sc->global, 1000000000); if (bstr_eatstart0(&cachedata, cache_header)) @@ -574,9 +596,10 @@ static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) MP_TARRAY_APPEND(sc, params.inputs, params.num_inputs, ubo_input); } - entry->pass = sc->ra->fns->renderpass_create(sc->ra, ¶ms); - if (!entry->pass) - goto error; + if (sc->pushc_size) { + params.push_constants_size = MP_ALIGN_UP(sc->pushc_size, 4); + entry->pushc = talloc_zero_size(entry, params.push_constants_size); + } if (sc->ubo_size) { struct ra_buf_params ubo_params = { @@ -592,12 +615,16 @@ static bool create_pass(struct gl_shader_cache *sc, struct sc_entry *entry) } } + 
entry->pass = sc->ra->fns->renderpass_create(sc->ra, ¶ms); + if (!entry->pass) + goto error; + if (entry->pass && cache_filename) { bstr nc = entry->pass->params.cached_program; if (nc.len && !bstr_equals(params.cached_program, nc)) { mp_mkdirp(cache_dir); - MP_VERBOSE(sc, "Writing shader cache file: %s\n", cache_filename); + MP_DBG(sc, "Writing shader cache file: %s\n", cache_filename); FILE *out = fopen(cache_filename, "wb"); if (out) { fwrite(cache_header, strlen(cache_header), 1, out); @@ -626,8 +653,22 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst) struct sc_uniform *u = &sc->uniforms[n]; if (u->type != SC_UNIFORM_TYPE_UBO) continue; - ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset, - u->glsl_type, u->input.name); + ADD(dst, "layout(offset=%zu) %s %s;\n", u->offset, u->glsl_type, + u->input.name); + } + ADD(dst, "};\n"); + } + + // Ditto for push constants + if (sc->pushc_size > 0) { + ADD(dst, "layout(std430, push_constant) uniform PushC {\n"); + for (int n = 0; n < sc->num_uniforms; n++) { + struct sc_uniform *u = &sc->uniforms[n]; + if (u->type != SC_UNIFORM_TYPE_PUSHC) + continue; + // push constants don't support explicit offsets + ADD(dst, "/*offset=%zu*/ %s %s;\n", u->offset, u->glsl_type, + u->input.name); } ADD(dst, "};\n"); } @@ -642,7 +683,6 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst) assert(sc->ra->caps & RA_CAP_GLOBAL_UNIFORM); // fall through case RA_VARTYPE_TEX: - case RA_VARTYPE_IMG_W: // Vulkan requires explicitly assigning the bindings in the shader // source. 
For OpenGL it's optional, but requires higher GL version // so we don't do it (and instead have ra_gl update the bindings @@ -659,6 +699,22 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst) ADD(dst, "layout(std430, binding=%d) buffer %s { %s };\n", u->input.binding, u->input.name, u->buffer_format); break; + case RA_VARTYPE_IMG_W: { + // For better compatibility, we have to explicitly label the + // type of data we will be reading/writing to this image. + const char *fmt = u->v.tex->params.format->glsl_format; + + if (sc->ra->glsl_vulkan) { + if (fmt) { + ADD(dst, "layout(binding=%d, %s) ", u->input.binding, fmt); + } else { + ADD(dst, "layout(binding=%d) ", u->input.binding); + } + } else if (fmt) { + ADD(dst, "layout(%s) ", fmt); + } + ADD(dst, "uniform %s %s;\n", u->glsl_type, u->input.name); + } } } } @@ -674,7 +730,9 @@ static void add_uniforms(struct gl_shader_cache *sc, bstr *dst) // and fragment operations needed for the next program have to be re-added.) static void gl_sc_generate(struct gl_shader_cache *sc, enum ra_renderpass_type type, - const struct ra_format *target_format) + const struct ra_format *target_format, + const struct ra_renderpass_input *vao, + int vao_len, size_t vertex_stride) { int glsl_version = sc->ra->glsl_version; int glsl_es = sc->ra->glsl_es ? 
glsl_version : 0; @@ -686,9 +744,6 @@ static void gl_sc_generate(struct gl_shader_cache *sc, assert(!sc->needs_reset); sc->needs_reset = true; - // gl_sc_set_vertex_format() must always be called - assert(sc->params.vertex_attribs); - // If using a UBO, pick a binding (needed for shader generation) if (sc->ubo_size) sc->ubo_binding = gl_sc_next_binding(sc, RA_VARTYPE_BUF_RO); @@ -745,8 +800,8 @@ static void gl_sc_generate(struct gl_shader_cache *sc, bstr *vert_body = &sc->tmp[2]; ADD(vert_body, "void main() {\n"); bstr *frag_vaos = &sc->tmp[3]; - for (int n = 0; n < sc->params.num_vertex_attribs; n++) { - const struct ra_renderpass_input *e = &sc->params.vertex_attribs[n]; + for (int n = 0; n < vao_len; n++) { + const struct ra_renderpass_input *e = &vao[n]; const char *glsl_type = vao_glsl_type(e); char loc[32] = {0}; if (sc->ra->glsl_vulkan) @@ -857,6 +912,19 @@ static void gl_sc_generate(struct gl_shader_cache *sc, .total = bstrdup(entry, *hash_total), .timer = timer_pool_create(sc->ra), }; + + // The vertex shader uses mangled names for the vertex attributes, so + // that the fragment shader can use the "real" names. But the shader is + // expecting the vertex attribute names (at least with older GLSL + // targets for GL). 
+ sc->params.vertex_stride = vertex_stride; + for (int n = 0; n < vao_len; n++) { + struct ra_renderpass_input attrib = vao[n]; + attrib.name = talloc_asprintf(entry, "vertex_%s", attrib.name); + MP_TARRAY_APPEND(sc, sc->params.vertex_attribs, + sc->params.num_vertex_attribs, attrib); + } + for (int n = 0; n < sc->num_uniforms; n++) { struct sc_cached_uniform u = {0}; if (sc->uniforms[n].type == SC_UNIFORM_TYPE_GLOBAL) { @@ -872,8 +940,11 @@ static void gl_sc_generate(struct gl_shader_cache *sc, sc->error_state = true; MP_TARRAY_APPEND(sc, sc->entries, sc->num_entries, entry); } - if (sc->error_state) + + if (!entry->pass) { + sc->current_shader = NULL; return; + } assert(sc->num_uniforms == entry->num_cached_uniforms); @@ -895,11 +966,14 @@ static void gl_sc_generate(struct gl_shader_cache *sc, struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, struct ra_tex *target, - void *ptr, size_t num) + const struct ra_renderpass_input *vao, + int vao_len, size_t vertex_stride, + void *vertices, size_t num_vertices) { struct timer_pool *timer = NULL; - gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format); + gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format, + vao, vao_len, vertex_stride); if (!sc->current_shader) goto error; @@ -911,9 +985,10 @@ struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, .pass = sc->current_shader->pass, .values = sc->values, .num_values = sc->num_values, + .push_constants = sc->current_shader->pushc, .target = target, - .vertex_data = ptr, - .vertex_count = num, + .vertex_data = vertices, + .vertex_count = num_vertices, .viewport = full_rc, .scissors = full_rc, }; @@ -932,7 +1007,7 @@ struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, { struct timer_pool *timer = NULL; - gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL); + gl_sc_generate(sc, RA_RENDERPASS_TYPE_COMPUTE, NULL, NULL, 0, 0); if (!sc->current_shader) goto error; @@ -942,6 +1017,7 @@ struct 
mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, .pass = sc->current_shader->pass, .values = sc->values, .num_values = sc->num_values, + .push_constants = sc->current_shader->pushc, .compute_groups = {w, h, d}, }; diff --git a/video/out/opengl/shader_cache.h b/video/out/gpu/shader_cache.h index 82a0780..2fe7dcf 100644 --- a/video/out/opengl/shader_cache.h +++ b/video/out/gpu/shader_cache.h @@ -25,6 +25,10 @@ void gl_sc_haddf(struct gl_shader_cache *sc, const char *textf, ...) void gl_sc_hadd_bstr(struct gl_shader_cache *sc, struct bstr text); void gl_sc_paddf(struct gl_shader_cache *sc, const char *textf, ...) PRINTF_ATTRIBUTE(2, 3); + +// A hint that the next data-type (i.e. non-binding) uniform is expected to +// change frequently. This refers to the _f, _i, _vecN etc. uniform types. +void gl_sc_uniform_dynamic(struct gl_shader_cache *sc); void gl_sc_uniform_texture(struct gl_shader_cache *sc, char *name, struct ra_tex *tex); void gl_sc_uniform_image2D_wo(struct gl_shader_cache *sc, const char *name, @@ -39,9 +43,6 @@ void gl_sc_uniform_mat2(struct gl_shader_cache *sc, char *name, bool transpose, float *v); void gl_sc_uniform_mat3(struct gl_shader_cache *sc, char *name, bool transpose, float *v); -void gl_sc_set_vertex_format(struct gl_shader_cache *sc, - const struct ra_renderpass_input *vertex_attribs, - int vertex_stride); void gl_sc_blend(struct gl_shader_cache *sc, enum ra_blend blend_src_rgb, enum ra_blend blend_dst_rgb, @@ -50,7 +51,12 @@ void gl_sc_blend(struct gl_shader_cache *sc, void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name); struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc, struct ra_tex *target, + const struct ra_renderpass_input *vao, + int vao_len, size_t vertex_stride, void *ptr, size_t num); struct mp_pass_perf gl_sc_dispatch_compute(struct gl_shader_cache *sc, int w, int h, int d); +// The application can call this on errors, to reset the current shader. 
This +// is normally done implicitly by gl_sc_dispatch_* +void gl_sc_reset(struct gl_shader_cache *sc); void gl_sc_set_cache_dir(struct gl_shader_cache *sc, const char *dir); diff --git a/video/out/gpu/spirv.c b/video/out/gpu/spirv.c new file mode 100644 index 0000000..e20fbe7 --- /dev/null +++ b/video/out/gpu/spirv.c @@ -0,0 +1,78 @@ +#include "common/msg.h" +#include "options/m_config.h" + +#include "spirv.h" +#include "config.h" + +extern const struct spirv_compiler_fns spirv_shaderc; +extern const struct spirv_compiler_fns spirv_nvidia_builtin; + +// in probe-order +enum { + SPIRV_AUTO = 0, + SPIRV_SHADERC, // generally preferred, but not packaged everywhere + SPIRV_NVIDIA, // can be useful for testing, only available on nvidia +}; + +static const struct spirv_compiler_fns *compilers[] = { +#if HAVE_SHADERC + [SPIRV_SHADERC] = &spirv_shaderc, +#endif +#if HAVE_VULKAN + [SPIRV_NVIDIA] = &spirv_nvidia_builtin, +#endif +}; + +static const struct m_opt_choice_alternatives compiler_choices[] = { + {"auto", SPIRV_AUTO}, +#if HAVE_SHADERC + {"shaderc", SPIRV_SHADERC}, +#endif +#if HAVE_VULKAN + {"nvidia", SPIRV_NVIDIA}, +#endif + {0} +}; + +struct spirv_opts { + int compiler; +}; + +#define OPT_BASE_STRUCT struct spirv_opts +const struct m_sub_options spirv_conf = { + .opts = (const struct m_option[]) { + OPT_CHOICE_C("spirv-compiler", compiler, 0, compiler_choices), + {0} + }, + .size = sizeof(struct spirv_opts), +}; + +bool spirv_compiler_init(struct ra_ctx *ctx) +{ + void *tmp = talloc_new(NULL); + struct spirv_opts *opts = mp_get_config_group(tmp, ctx->global, &spirv_conf); + int compiler = opts->compiler; + talloc_free(tmp); + + for (int i = SPIRV_AUTO+1; i < MP_ARRAY_SIZE(compilers); i++) { + if (compiler != SPIRV_AUTO && i != compiler) + continue; + if (!compilers[i]) + continue; + + ctx->spirv = talloc_zero(ctx, struct spirv_compiler); + ctx->spirv->log = ctx->log, + ctx->spirv->fns = compilers[i]; + + const char *name = m_opt_choice_str(compiler_choices, i); 
+ strncpy(ctx->spirv->name, name, sizeof(ctx->spirv->name)); + MP_VERBOSE(ctx, "Initializing SPIR-V compiler '%s'\n", name); + if (ctx->spirv->fns->init(ctx)) + return true; + talloc_free(ctx->spirv); + ctx->spirv = NULL; + } + + MP_ERR(ctx, "Failed initializing SPIR-V compiler!\n"); + return false; +} diff --git a/video/out/gpu/spirv.h b/video/out/gpu/spirv.h new file mode 100644 index 0000000..e3dbd4f --- /dev/null +++ b/video/out/gpu/spirv.h @@ -0,0 +1,41 @@ +#pragma once + +#include "common/msg.h" +#include "common/common.h" +#include "context.h" + +enum glsl_shader { + GLSL_SHADER_VERTEX, + GLSL_SHADER_FRAGMENT, + GLSL_SHADER_COMPUTE, +}; + +#define SPIRV_NAME_MAX_LEN 32 + +struct spirv_compiler { + char name[SPIRV_NAME_MAX_LEN]; + const struct spirv_compiler_fns *fns; + struct mp_log *log; + void *priv; + + const char *required_ext; // or NULL + int glsl_version; // GLSL version supported + int compiler_version; // for cache invalidation, may be left as 0 + int ra_caps; // RA_CAP_* provided by this implementation, if any +}; + +struct spirv_compiler_fns { + // Compile GLSL to SPIR-V, under GL_KHR_vulkan_glsl semantics. + bool (*compile_glsl)(struct spirv_compiler *spirv, void *tactx, + enum glsl_shader type, const char *glsl, + struct bstr *out_spirv); + + // Called by spirv_compiler_init / ra_ctx_destroy. These don't need to + // allocate/free ctx->spirv, that is done by the caller + bool (*init)(struct ra_ctx *ctx); + void (*uninit)(struct ra_ctx *ctx); // optional +}; + +// Initializes ctx->spirv to a valid SPIR-V compiler, or returns false on +// failure. Cleanup will be handled by ra_ctx_destroy. 
+bool spirv_compiler_init(struct ra_ctx *ctx); diff --git a/video/out/gpu/spirv_shaderc.c b/video/out/gpu/spirv_shaderc.c new file mode 100644 index 0000000..ee70205 --- /dev/null +++ b/video/out/gpu/spirv_shaderc.c @@ -0,0 +1,125 @@ +#include "common/msg.h" + +#include "context.h" +#include "spirv.h" + +#include <shaderc/shaderc.h> + +struct priv { + shaderc_compiler_t compiler; + shaderc_compile_options_t opts; +}; + +static void shaderc_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->spirv->priv; + if (!p) + return; + + shaderc_compile_options_release(p->opts); + shaderc_compiler_release(p->compiler); +} + +static bool shaderc_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->spirv->priv = talloc_zero(ctx->spirv, struct priv); + + p->compiler = shaderc_compiler_initialize(); + if (!p->compiler) + goto error; + p->opts = shaderc_compile_options_initialize(); + if (!p->opts) + goto error; + + shaderc_compile_options_set_optimization_level(p->opts, + shaderc_optimization_level_size); + if (ctx->opts.debug) + shaderc_compile_options_set_generate_debug_info(p->opts); + + int ver, rev; + shaderc_get_spv_version(&ver, &rev); + ctx->spirv->compiler_version = ver * 100 + rev; // forwards compatibility + ctx->spirv->glsl_version = 450; // impossible to query? 
+ return true; + +error: + shaderc_uninit(ctx); + return false; +} + +static shaderc_compilation_result_t compile(struct priv *p, + enum glsl_shader type, + const char *glsl, bool debug) +{ + static const shaderc_shader_kind kinds[] = { + [GLSL_SHADER_VERTEX] = shaderc_glsl_vertex_shader, + [GLSL_SHADER_FRAGMENT] = shaderc_glsl_fragment_shader, + [GLSL_SHADER_COMPUTE] = shaderc_glsl_compute_shader, + }; + + if (debug) { + return shaderc_compile_into_spv_assembly(p->compiler, glsl, strlen(glsl), + kinds[type], "input", "main", p->opts); + } else { + return shaderc_compile_into_spv(p->compiler, glsl, strlen(glsl), + kinds[type], "input", "main", p->opts); + } +} + +static bool shaderc_compile(struct spirv_compiler *spirv, void *tactx, + enum glsl_shader type, const char *glsl, + struct bstr *out_spirv) +{ + struct priv *p = spirv->priv; + + shaderc_compilation_result_t res = compile(p, type, glsl, false); + int errs = shaderc_result_get_num_errors(res), + warn = shaderc_result_get_num_warnings(res), + msgl = errs ? MSGL_ERR : warn ? MSGL_WARN : MSGL_V; + + const char *msg = shaderc_result_get_error_message(res); + if (msg[0]) + MP_MSG(spirv, msgl, "shaderc output:\n%s", msg); + + int s = shaderc_result_get_compilation_status(res); + bool success = s == shaderc_compilation_status_success; + + static const char *results[] = { + [shaderc_compilation_status_success] = "success", + [shaderc_compilation_status_invalid_stage] = "invalid stage", + [shaderc_compilation_status_compilation_error] = "error", + [shaderc_compilation_status_internal_error] = "internal error", + [shaderc_compilation_status_null_result_object] = "no result", + [shaderc_compilation_status_invalid_assembly] = "invalid assembly", + }; + + const char *status = s < MP_ARRAY_SIZE(results) ? 
results[s] : "unknown"; + MP_MSG(spirv, msgl, "shaderc compile status '%s' (%d errors, %d warnings)\n", + status, errs, warn); + + if (success) { + void *bytes = (void *) shaderc_result_get_bytes(res); + out_spirv->len = shaderc_result_get_length(res); + out_spirv->start = talloc_memdup(tactx, bytes, out_spirv->len); + } + + // Also print SPIR-V disassembly for debugging purposes. Unfortunately + // there doesn't seem to be a way to get this except compiling the shader + // a second time.. + if (mp_msg_test(spirv->log, MSGL_TRACE)) { + shaderc_compilation_result_t dis = compile(p, type, glsl, true); + MP_TRACE(spirv, "Generated SPIR-V:\n%.*s", + (int)shaderc_result_get_length(dis), + shaderc_result_get_bytes(dis)); + shaderc_result_release(dis); + } + + shaderc_result_release(res); + return success; +} + +const struct spirv_compiler_fns spirv_shaderc = { + .compile_glsl = shaderc_compile, + .init = shaderc_init, + .uninit = shaderc_uninit, +}; diff --git a/video/out/opengl/user_shaders.c b/video/out/gpu/user_shaders.c index 58a1ac9..446941b 100644 --- a/video/out/opengl/user_shaders.c +++ b/video/out/gpu/user_shaders.c @@ -17,9 +17,9 @@ #include <assert.h> +#include "common/msg.h" #include "misc/ctype.h" #include "user_shaders.h" -#include "formats.h" static bool parse_rpn_szexpr(struct bstr line, struct szexp out[MAX_SZEXP_SIZE]) { diff --git a/video/out/opengl/user_shaders.h b/video/out/gpu/user_shaders.h index 94a070c..8d8cc6b 100644 --- a/video/out/opengl/user_shaders.h +++ b/video/out/gpu/user_shaders.h @@ -21,10 +21,8 @@ #include "utils.h" #include "ra.h" -#define SHADER_MAX_PASSES 32 #define SHADER_MAX_HOOKS 16 -#define SHADER_MAX_BINDS 6 -#define SHADER_MAX_SAVED 64 +#define SHADER_MAX_BINDS 16 #define MAX_SZEXP_SIZE 32 enum szexp_op { diff --git a/video/out/gpu/utils.c b/video/out/gpu/utils.c new file mode 100644 index 0000000..078a31c --- /dev/null +++ b/video/out/gpu/utils.c @@ -0,0 +1,332 @@ +#include "common/msg.h" +#include "video/out/vo.h" +#include 
"utils.h" + +// Standard parallel 2D projection, except y1 < y0 means that the coordinate +// system is flipped, not the projection. +void gl_transform_ortho(struct gl_transform *t, float x0, float x1, + float y0, float y1) +{ + if (y1 < y0) { + float tmp = y0; + y0 = tmp - y1; + y1 = tmp; + } + + t->m[0][0] = 2.0f / (x1 - x0); + t->m[0][1] = 0.0f; + t->m[1][0] = 0.0f; + t->m[1][1] = 2.0f / (y1 - y0); + t->t[0] = -(x1 + x0) / (x1 - x0); + t->t[1] = -(y1 + y0) / (y1 - y0); +} + +// Apply the effects of one transformation to another, transforming it in the +// process. In other words: post-composes t onto x +void gl_transform_trans(struct gl_transform t, struct gl_transform *x) +{ + struct gl_transform xt = *x; + x->m[0][0] = t.m[0][0] * xt.m[0][0] + t.m[0][1] * xt.m[1][0]; + x->m[1][0] = t.m[1][0] * xt.m[0][0] + t.m[1][1] * xt.m[1][0]; + x->m[0][1] = t.m[0][0] * xt.m[0][1] + t.m[0][1] * xt.m[1][1]; + x->m[1][1] = t.m[1][0] * xt.m[0][1] + t.m[1][1] * xt.m[1][1]; + gl_transform_vec(t, &x->t[0], &x->t[1]); +} + +void gl_transform_ortho_fbo(struct gl_transform *t, struct ra_fbo fbo) +{ + int y_dir = fbo.flip ? 
-1 : 1; + gl_transform_ortho(t, 0, fbo.tex->params.w, 0, fbo.tex->params.h * y_dir); +} + +void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool) +{ + for (int i = 0; i < pool->num_buffers; i++) + ra_buf_free(ra, &pool->buffers[i]); + + talloc_free(pool->buffers); + *pool = (struct ra_buf_pool){0}; +} + +static bool ra_buf_params_compatible(const struct ra_buf_params *new, + const struct ra_buf_params *old) +{ + return new->type == old->type && + new->size <= old->size && + new->host_mapped == old->host_mapped && + new->host_mutable == old->host_mutable; +} + +static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) +{ + struct ra_buf *buf = ra_buf_create(ra, &pool->current_params); + if (!buf) + return false; + + MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); + MP_VERBOSE(ra, "Resized buffer pool of type %u to size %d\n", + pool->current_params.type, pool->num_buffers); + return true; +} + +struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, + const struct ra_buf_params *params) +{ + assert(!params->initial_data); + + if (!ra_buf_params_compatible(params, &pool->current_params)) { + ra_buf_pool_uninit(ra, pool); + pool->current_params = *params; + } + + // Make sure we have at least one buffer available + if (!pool->buffers && !ra_buf_pool_grow(ra, pool)) + return NULL; + + // Make sure the next buffer is available for use + if (!ra->fns->buf_poll(ra, pool->buffers[pool->index]) && + !ra_buf_pool_grow(ra, pool)) + { + return NULL; + } + + struct ra_buf *buf = pool->buffers[pool->index++]; + pool->index %= pool->num_buffers; + + return buf; +} + +bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, + const struct ra_tex_upload_params *params) +{ + if (params->buf) + return ra->fns->tex_upload(ra, params); + + struct ra_tex *tex = params->tex; + size_t row_size = tex->params.dimensions == 2 ? 
params->stride : + tex->params.w * tex->params.format->pixel_size; + + int height = tex->params.h; + if (tex->params.dimensions == 2 && params->rc) + height = mp_rect_h(*params->rc); + + struct ra_buf_params bufparams = { + .type = RA_BUF_TYPE_TEX_UPLOAD, + .size = row_size * height * tex->params.d, + .host_mutable = true, + }; + + struct ra_buf *buf = ra_buf_pool_get(ra, pbo, &bufparams); + if (!buf) + return false; + + ra->fns->buf_update(ra, buf, 0, params->src, bufparams.size); + + struct ra_tex_upload_params newparams = *params; + newparams.buf = buf; + newparams.src = NULL; + + return ra->fns->tex_upload(ra, &newparams); +} + +struct ra_layout std140_layout(struct ra_renderpass_input *inp) +{ + size_t el_size = ra_vartype_size(inp->type); + + // std140 packing rules: + // 1. The alignment of generic values is their size in bytes + // 2. The alignment of vectors is the vector length * the base count, with + // the exception of vec3 which is always aligned like vec4 + // 3. The alignment of arrays is that of the element size rounded up to + // the nearest multiple of vec4 + // 4. Matrices are treated like arrays of vectors + // 5. 
Arrays/matrices are laid out with a stride equal to the alignment + size_t size = el_size * inp->dim_v; + if (inp->dim_v == 3) + size += el_size; + if (inp->dim_m > 1) + size = MP_ALIGN_UP(size, sizeof(float[4])); + + return (struct ra_layout) { + .align = size, + .stride = size, + .size = size * inp->dim_m, + }; +} + +struct ra_layout std430_layout(struct ra_renderpass_input *inp) +{ + size_t el_size = ra_vartype_size(inp->type); + + // std430 packing rules: like std140, except arrays/matrices are always + // "tightly" packed, even arrays/matrices of vec3s + size_t size = el_size * inp->dim_v; + if (inp->dim_v == 3 && inp->dim_m == 1) + size += el_size; + + return (struct ra_layout) { + .align = size, + .stride = size, + .size = size * inp->dim_m, + }; +} + +// Resize a texture to a new desired size and format if necessary +bool ra_tex_resize(struct ra *ra, struct mp_log *log, struct ra_tex **tex, + int w, int h, const struct ra_format *fmt) +{ + if (*tex) { + struct ra_tex_params cur_params = (*tex)->params; + if (cur_params.w == w && cur_params.h == h && cur_params.format == fmt) + return true; + } + + mp_dbg(log, "Resizing texture: %dx%d\n", w, h); + + if (!fmt || !fmt->renderable || !fmt->linear_filter) { + mp_err(log, "Format %s not supported.\n", fmt ? 
fmt->name : "(unset)"); + return false; + } + + ra_tex_free(ra, tex); + struct ra_tex_params params = { + .dimensions = 2, + .w = w, + .h = h, + .d = 1, + .format = fmt, + .src_linear = true, + .render_src = true, + .render_dst = true, + .storage_dst = true, + .blit_src = true, + }; + + *tex = ra_tex_create(ra, ¶ms); + if (!*tex) + mp_err(log, "Error: texture could not be created.\n"); + + return *tex; +} + +struct timer_pool { + struct ra *ra; + ra_timer *timer; + bool running; // detect invalid usage + + uint64_t samples[VO_PERF_SAMPLE_COUNT]; + int sample_idx; + int sample_count; + + uint64_t sum; + uint64_t peak; +}; + +struct timer_pool *timer_pool_create(struct ra *ra) +{ + if (!ra->fns->timer_create) + return NULL; + + ra_timer *timer = ra->fns->timer_create(ra); + if (!timer) + return NULL; + + struct timer_pool *pool = talloc(NULL, struct timer_pool); + if (!pool) { + ra->fns->timer_destroy(ra, timer); + return NULL; + } + + *pool = (struct timer_pool){ .ra = ra, .timer = timer }; + return pool; +} + +void timer_pool_destroy(struct timer_pool *pool) +{ + if (!pool) + return; + + pool->ra->fns->timer_destroy(pool->ra, pool->timer); + talloc_free(pool); +} + +void timer_pool_start(struct timer_pool *pool) +{ + if (!pool) + return; + + assert(!pool->running); + pool->ra->fns->timer_start(pool->ra, pool->timer); + pool->running = true; +} + +void timer_pool_stop(struct timer_pool *pool) +{ + if (!pool) + return; + + assert(pool->running); + uint64_t res = pool->ra->fns->timer_stop(pool->ra, pool->timer); + pool->running = false; + + if (res) { + // Input res into the buffer and grab the previous value + uint64_t old = pool->samples[pool->sample_idx]; + pool->sample_count = MPMIN(pool->sample_count + 1, VO_PERF_SAMPLE_COUNT); + pool->samples[pool->sample_idx++] = res; + pool->sample_idx %= VO_PERF_SAMPLE_COUNT; + pool->sum = pool->sum + res - old; + + // Update peak if necessary + if (res >= pool->peak) { + pool->peak = res; + } else if (pool->peak == old) { + 
// It's possible that the last peak was the value we just removed, + // if so we need to scan for the new peak + uint64_t peak = res; + for (int i = 0; i < VO_PERF_SAMPLE_COUNT; i++) + peak = MPMAX(peak, pool->samples[i]); + pool->peak = peak; + } + } +} + +struct mp_pass_perf timer_pool_measure(struct timer_pool *pool) +{ + if (!pool) + return (struct mp_pass_perf){0}; + + struct mp_pass_perf res = { + .peak = pool->peak, + .count = pool->sample_count, + }; + + int idx = pool->sample_idx - pool->sample_count + VO_PERF_SAMPLE_COUNT; + for (int i = 0; i < res.count; i++) { + idx %= VO_PERF_SAMPLE_COUNT; + res.samples[i] = pool->samples[idx++]; + } + + if (res.count > 0) { + res.last = res.samples[res.count - 1]; + res.avg = pool->sum / res.count; + } + + return res; +} + +void mp_log_source(struct mp_log *log, int lev, const char *src) +{ + int line = 1; + if (!src) + return; + while (*src) { + const char *end = strchr(src, '\n'); + const char *next = end + 1; + if (!end) + next = end = src + strlen(src); + mp_msg(log, lev, "[%3d] %.*s\n", line, (int)(end - src), src); + line++; + src = next; + } +} diff --git a/video/out/gpu/utils.h b/video/out/gpu/utils.h new file mode 100644 index 0000000..ac0cbf2 --- /dev/null +++ b/video/out/gpu/utils.h @@ -0,0 +1,105 @@ +#pragma once + +#include <stdbool.h> +#include <math.h> + +#include "ra.h" +#include "context.h" + +// A 3x2 matrix, with the translation part separate. +struct gl_transform { + // row-major, e.g. in mathematical notation: + // | m[0][0] m[0][1] | + // | m[1][0] m[1][1] | + float m[2][2]; + float t[2]; +}; + +static const struct gl_transform identity_trans = { + .m = {{1.0, 0.0}, {0.0, 1.0}}, + .t = {0.0, 0.0}, +}; + +void gl_transform_ortho(struct gl_transform *t, float x0, float x1, + float y0, float y1); + +// This treats m as an affine transformation, in other words m[2][n] gets +// added to the output. 
+static inline void gl_transform_vec(struct gl_transform t, float *x, float *y) +{ + float vx = *x, vy = *y; + *x = vx * t.m[0][0] + vy * t.m[0][1] + t.t[0]; + *y = vx * t.m[1][0] + vy * t.m[1][1] + t.t[1]; +} + +struct mp_rect_f { + float x0, y0, x1, y1; +}; + +// Semantic equality (fuzzy comparison) +static inline bool mp_rect_f_seq(struct mp_rect_f a, struct mp_rect_f b) +{ + return fabs(a.x0 - b.x0) < 1e-6 && fabs(a.x1 - b.x1) < 1e-6 && + fabs(a.y0 - b.y0) < 1e-6 && fabs(a.y1 - b.y1) < 1e-6; +} + +static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r) +{ + gl_transform_vec(t, &r->x0, &r->y0); + gl_transform_vec(t, &r->x1, &r->y1); +} + +static inline bool gl_transform_eq(struct gl_transform a, struct gl_transform b) +{ + for (int x = 0; x < 2; x++) { + for (int y = 0; y < 2; y++) { + if (a.m[x][y] != b.m[x][y]) + return false; + } + } + + return a.t[0] == b.t[0] && a.t[1] == b.t[1]; +} + +void gl_transform_trans(struct gl_transform t, struct gl_transform *x); + +void gl_transform_ortho_fbo(struct gl_transform *t, struct ra_fbo fbo); + +// A pool of buffers, which can grow as needed +struct ra_buf_pool { + struct ra_buf_params current_params; + struct ra_buf **buffers; + int num_buffers; + int index; +}; + +void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool); + +// Note: params->initial_data is *not* supported +struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, + const struct ra_buf_params *params); + +// Helper that wraps ra_tex_upload using texture upload buffers to ensure that +// params->buf is always set. This is intended for RA-internal usage. 
+bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, + const struct ra_tex_upload_params *params); + +// Layout rules for GLSL's packing modes +struct ra_layout std140_layout(struct ra_renderpass_input *inp); +struct ra_layout std430_layout(struct ra_renderpass_input *inp); + +bool ra_tex_resize(struct ra *ra, struct mp_log *log, struct ra_tex **tex, + int w, int h, const struct ra_format *fmt); + +// A wrapper around ra_timer that does result pooling, averaging etc. +struct timer_pool; + +struct timer_pool *timer_pool_create(struct ra *ra); +void timer_pool_destroy(struct timer_pool *pool); +void timer_pool_start(struct timer_pool *pool); +void timer_pool_stop(struct timer_pool *pool); +struct mp_pass_perf timer_pool_measure(struct timer_pool *pool); + +// print a multi line string with line numbers (e.g. for shader sources) +// log, lev: module and log level, as in mp_msg() +void mp_log_source(struct mp_log *log, int lev, const char *src); diff --git a/video/out/opengl/video.c b/video/out/gpu/video.c index 3362381..f80d63a 100644 --- a/video/out/opengl/video.c +++ b/video/out/gpu/video.c @@ -60,28 +60,12 @@ static const char *const fixed_tscale_filters[] = { // must be sorted, and terminated with 0 int filter_sizes[] = {2, 4, 6, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64, 0}; -int tscale_sizes[] = {2, 4, 6, 0}; // limited by TEXUNIT_VIDEO_NUM +int tscale_sizes[] = {2, 4, 6, 8, 0}; struct vertex_pt { float x, y; }; -struct vertex { - struct vertex_pt position; - struct vertex_pt texcoord[TEXUNIT_VIDEO_NUM]; -}; - -static const struct ra_renderpass_input vertex_vao[] = { - {"position", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, position)}, - {"texcoord0", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[0])}, - {"texcoord1", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[1])}, - {"texcoord2", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[2])}, - {"texcoord3", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, 
texcoord[3])}, - {"texcoord4", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[4])}, - {"texcoord5", RA_VARTYPE_FLOAT, 2, 1, offsetof(struct vertex, texcoord[5])}, - {0} -}; - struct texplane { struct ra_tex *tex; int w, h; @@ -115,7 +99,7 @@ static const char *plane_names[] = { // A self-contained description of a source image which can be bound to a // texture unit and sampled from. Contains metadata about how it's to be used -struct img_tex { +struct image { enum plane_type type; // must be set to something non-zero int components; // number of relevant coordinates float multiplier; // multiplier to be used when sampling @@ -124,10 +108,10 @@ struct img_tex { struct gl_transform transform; // rendering transformation }; -// A named img_tex, for user scripting purposes -struct saved_tex { +// A named image, for user scripting purposes +struct saved_img { const char *name; - struct img_tex tex; + struct image img; }; // A texture hook. This is some operation that transforms a named texture as @@ -135,21 +119,21 @@ struct saved_tex { struct tex_hook { const char *save_tex; const char *hook_tex[SHADER_MAX_HOOKS]; - const char *bind_tex[TEXUNIT_VIDEO_NUM]; + const char *bind_tex[SHADER_MAX_BINDS]; int components; // how many components are relevant (0 = same as input) void *priv; // this gets talloc_freed when the tex_hook is removed - void (*hook)(struct gl_video *p, struct img_tex tex, // generates GLSL + void (*hook)(struct gl_video *p, struct image img, // generates GLSL struct gl_transform *trans, void *priv); - bool (*cond)(struct gl_video *p, struct img_tex tex, void *priv); + bool (*cond)(struct gl_video *p, struct image img, void *priv); }; -struct fbosurface { - struct fbotex fbotex; +struct surface { + struct ra_tex *tex; uint64_t id; double pts; }; -#define FBOSURFACES_MAX 10 +#define SURFACES_MAX 10 struct cached_file { char *path; @@ -161,8 +145,6 @@ struct pass_info { struct mp_pass_perf perf; }; -#define PASS_INFO_MAX (SHADER_MAX_PASSES + 32) 
- struct dr_buffer { struct ra_buf *buf; // The mpi reference will keep the data from being recycled (or from other @@ -215,29 +197,40 @@ struct gl_video { bool dumb_mode; bool forced_dumb_mode; + // Cached vertex array, to avoid re-allocation per frame. For simplicity, + // our vertex format is simply a list of `vertex_pt`s, since this greatly + // simplifies offset calculation at the cost of (unneeded) flexibility. + struct vertex_pt *tmp_vertex; + struct ra_renderpass_input *vao; + int vao_len; + const struct ra_format *fbo_format; - struct fbotex merge_fbo[4]; - struct fbotex scale_fbo[4]; - struct fbotex integer_fbo[4]; - struct fbotex indirect_fbo; - struct fbotex blend_subs_fbo; - struct fbotex screen_fbo; - struct fbotex output_fbo; - struct fbosurface surfaces[FBOSURFACES_MAX]; - struct fbotex vdpau_deinterleave_fbo[2]; + struct ra_tex *merge_tex[4]; + struct ra_tex *scale_tex[4]; + struct ra_tex *integer_tex[4]; + struct ra_tex *indirect_tex; + struct ra_tex *blend_subs_tex; + struct ra_tex *screen_tex; + struct ra_tex *output_tex; + struct ra_tex *vdpau_deinterleave_tex[2]; + struct ra_tex **hook_textures; + int num_hook_textures; + int idx_hook_textures; + struct ra_buf *hdr_peak_ssbo; + struct surface surfaces[SURFACES_MAX]; // user pass descriptions and textures - struct tex_hook tex_hooks[SHADER_MAX_PASSES]; - int tex_hook_num; - struct gl_user_shader_tex user_textures[SHADER_MAX_PASSES]; - int user_tex_num; + struct tex_hook *tex_hooks; + int num_tex_hooks; + struct gl_user_shader_tex *user_textures; + int num_user_textures; int surface_idx; int surface_now; int frames_drawn; bool is_interpolated; - bool output_fbo_valid; + bool output_tex_valid; // state for configured scalers struct scaler scaler[SCALER_COUNT]; @@ -249,9 +242,15 @@ struct gl_video { struct mp_osd_res osd_rect; // OSD size/margins // temporary during rendering - struct img_tex pass_tex[TEXUNIT_VIDEO_NUM]; struct compute_info pass_compute; // compute shader metadata for this pass - 
int pass_tex_num; + struct image *pass_imgs; // bound images for this pass + int num_pass_imgs; + struct saved_img *saved_imgs; // saved (named) images for this frame + int num_saved_imgs; + + // effective current texture metadata - this will essentially affect the + // next render pass target, as well as implicitly tracking what needs to + // be done with the image int texture_w, texture_h; struct gl_transform texture_offset; // texture transform without rotation int components; @@ -259,20 +258,14 @@ struct gl_video { float user_gamma; // pass info / metrics - struct pass_info pass_fresh[PASS_INFO_MAX]; - struct pass_info pass_redraw[PASS_INFO_MAX]; + struct pass_info pass_fresh[VO_PASS_PERF_MAX]; + struct pass_info pass_redraw[VO_PASS_PERF_MAX]; struct pass_info *pass; int pass_idx; struct timer_pool *upload_timer; struct timer_pool *blit_timer; struct timer_pool *osd_timer; - // intermediate textures - struct saved_tex saved_tex[SHADER_MAX_SAVED]; - int saved_tex_num; - struct fbotex hook_fbos[SHADER_MAX_SAVED]; - int hook_fbo_num; - int frames_uploaded; int frames_rendered; AVLFG lfg; @@ -284,8 +277,12 @@ struct gl_video { struct cached_file *files; int num_files; - struct ra_hwdec *hwdec; + bool hwdec_interop_loading_done; + struct ra_hwdec **hwdecs; + int num_hwdecs; + struct ra_hwdec_mapper *hwdec_mapper; + struct ra_hwdec *hwdec_overlay; bool hwdec_active; bool dsi_warned; @@ -318,8 +315,9 @@ static const struct gl_video_opts gl_video_opts_def = { .gamma = 1.0f, .tone_mapping = TONE_MAPPING_MOBIUS, .tone_mapping_param = NAN, - .tone_mapping_desat = 2.0, + .tone_mapping_desat = 1.0, .early_flush = -1, + .hwdec_interop = "auto", }; static int validate_scaler_opt(struct mp_log *log, const m_option_t *opt, @@ -347,9 +345,9 @@ static int validate_window_opt(struct mp_log *log, const m_option_t *opt, const struct m_sub_options gl_video_conf = { .opts = (const m_option_t[]) { - OPT_CHOICE("opengl-dumb-mode", dumb_mode, 0, + OPT_CHOICE("gpu-dumb-mode", dumb_mode, 
0, ({"auto", 0}, {"yes", 1}, {"no", -1})), - OPT_FLOATRANGE("opengl-gamma", gamma, 0, 0.1, 2.0), + OPT_FLOATRANGE("gamma-factor", gamma, 0, 0.1, 2.0), OPT_FLAG("gamma-auto", gamma_auto, 0), OPT_CHOICE_C("target-prim", target_prim, 0, mp_csp_prim_names), OPT_CHOICE_C("target-trc", target_trc, 0, mp_csp_trc_names), @@ -376,7 +374,7 @@ const struct m_sub_options gl_video_conf = { OPT_FLAG("sigmoid-upscaling", sigmoid_upscaling, 0), OPT_FLOATRANGE("sigmoid-center", sigmoid_center, 0, 0.0, 1.0), OPT_FLOATRANGE("sigmoid-slope", sigmoid_slope, 0, 1.0, 20.0), - OPT_STRING("opengl-fbo-format", fbo_format, 0), + OPT_STRING("fbo-format", fbo_format, 0), OPT_CHOICE_OR_INT("dither-depth", dither_depth, 0, -1, 16, ({"no", -1}, {"auto", 0})), OPT_CHOICE("dither", dither_algo, 0, @@ -399,18 +397,28 @@ const struct m_sub_options gl_video_conf = { ({"no", BLEND_SUBS_NO}, {"yes", BLEND_SUBS_YES}, {"video", BLEND_SUBS_VIDEO})), - OPT_PATHLIST("opengl-shaders", user_shaders, 0), - OPT_CLI_ALIAS("opengl-shader", "opengl-shaders-append"), + OPT_PATHLIST("glsl-shaders", user_shaders, 0), + OPT_CLI_ALIAS("glsl-shader", "glsl-shaders-append"), OPT_FLAG("deband", deband, 0), OPT_SUBSTRUCT("deband", deband_opts, deband_conf, 0), OPT_FLOAT("sharpen", unsharp, 0), - OPT_INTRANGE("opengl-tex-pad-x", tex_pad_x, 0, 0, 4096), - OPT_INTRANGE("opengl-tex-pad-y", tex_pad_y, 0, 0, 4096), + OPT_INTRANGE("gpu-tex-pad-x", tex_pad_x, 0, 0, 4096), + OPT_INTRANGE("gpu-tex-pad-y", tex_pad_y, 0, 0, 4096), OPT_SUBSTRUCT("", icc_opts, mp_icc_conf, 0), - OPT_CHOICE("opengl-early-flush", early_flush, 0, - ({"no", 0}, {"yes", 1}, {"auto", -1})), - OPT_STRING("opengl-shader-cache-dir", shader_cache_dir, 0), + OPT_STRING("gpu-shader-cache-dir", shader_cache_dir, 0), + OPT_STRING_VALIDATE("gpu-hwdec-interop", hwdec_interop, 0, + ra_hwdec_validate_opt), + OPT_REPLACED("opengl-hwdec-interop", "gpu-hwdec-interop"), + OPT_REPLACED("hwdec-preload", "opengl-hwdec-interop"), OPT_REPLACED("hdr-tone-mapping", "tone-mapping"), 
+ OPT_REPLACED("opengl-shaders", "glsl-shaders"), + OPT_REPLACED("opengl-shader", "glsl-shader"), + OPT_REPLACED("opengl-shader-cache-dir", "gpu-shader-cache-dir"), + OPT_REPLACED("opengl-tex-pad-x", "gpu-tex-pad-x"), + OPT_REPLACED("opengl-tex-pad-y", "gpu-tex-pad-y"), + OPT_REPLACED("opengl-fbo-format", "fbo-format"), + OPT_REPLACED("opengl-dumb-mode", "gpu-dumb-mode"), + OPT_REPLACED("opengl-gamma", "gamma-factor"), {0} }, .size = sizeof(struct gl_video_opts), @@ -425,6 +433,7 @@ static const char *handle_scaler_opt(const char *name, bool tscale); static void reinit_from_options(struct gl_video *p); static void get_scale_factors(struct gl_video *p, bool transpose_rot, double xy[2]); static void gl_video_setup_hooks(struct gl_video *p); +static void gl_video_update_options(struct gl_video *p); #define GLSL(x) gl_sc_add(p->sc, #x "\n"); #define GLSLF(...) gl_sc_addf(p->sc, __VA_ARGS__) @@ -460,32 +469,32 @@ static void debug_check_gl(struct gl_video *p, const char *msg) static void gl_video_reset_surfaces(struct gl_video *p) { - for (int i = 0; i < FBOSURFACES_MAX; i++) { + for (int i = 0; i < SURFACES_MAX; i++) { p->surfaces[i].id = 0; p->surfaces[i].pts = MP_NOPTS_VALUE; } p->surface_idx = 0; p->surface_now = 0; p->frames_drawn = 0; - p->output_fbo_valid = false; + p->output_tex_valid = false; } static void gl_video_reset_hooks(struct gl_video *p) { - for (int i = 0; i < p->tex_hook_num; i++) + for (int i = 0; i < p->num_tex_hooks; i++) talloc_free(p->tex_hooks[i].priv); - for (int i = 0; i < p->user_tex_num; i++) + for (int i = 0; i < p->num_user_textures; i++) ra_tex_free(p->ra, &p->user_textures[i].tex); - p->tex_hook_num = 0; - p->user_tex_num = 0; + p->num_tex_hooks = 0; + p->num_user_textures = 0; } -static inline int fbosurface_wrap(int id) +static inline int surface_wrap(int id) { - id = id % FBOSURFACES_MAX; - return id < 0 ? id + FBOSURFACES_MAX : id; + id = id % SURFACES_MAX; + return id < 0 ? 
id + SURFACES_MAX : id; } static void reinit_osd(struct gl_video *p) @@ -504,24 +513,24 @@ static void uninit_rendering(struct gl_video *p) ra_tex_free(p->ra, &p->dither_texture); for (int n = 0; n < 4; n++) { - fbotex_uninit(&p->merge_fbo[n]); - fbotex_uninit(&p->scale_fbo[n]); - fbotex_uninit(&p->integer_fbo[n]); + ra_tex_free(p->ra, &p->merge_tex[n]); + ra_tex_free(p->ra, &p->scale_tex[n]); + ra_tex_free(p->ra, &p->integer_tex[n]); } - fbotex_uninit(&p->indirect_fbo); - fbotex_uninit(&p->blend_subs_fbo); - fbotex_uninit(&p->screen_fbo); - fbotex_uninit(&p->output_fbo); + ra_tex_free(p->ra, &p->indirect_tex); + ra_tex_free(p->ra, &p->blend_subs_tex); + ra_tex_free(p->ra, &p->screen_tex); + ra_tex_free(p->ra, &p->output_tex); - for (int n = 0; n < FBOSURFACES_MAX; n++) - fbotex_uninit(&p->surfaces[n].fbotex); + for (int n = 0; n < SURFACES_MAX; n++) + ra_tex_free(p->ra, &p->surfaces[n].tex); - for (int n = 0; n < SHADER_MAX_SAVED; n++) - fbotex_uninit(&p->hook_fbos[n]); + for (int n = 0; n < p->num_hook_textures; n++) + ra_tex_free(p->ra, &p->hook_textures[n]); for (int n = 0; n < 2; n++) - fbotex_uninit(&p->vdpau_deinterleave_fbo[n]); + ra_tex_free(p->ra, &p->vdpau_deinterleave_tex[n]); gl_video_reset_surfaces(p); gl_video_reset_hooks(p); @@ -607,29 +616,28 @@ static bool gl_video_get_lut3d(struct gl_video *p, enum mp_csp_prim prim, return true; } -// Fill an img_tex struct from an FBO + some metadata -static struct img_tex img_tex_fbo(struct fbotex *fbo, enum plane_type type, - int components) +// Fill an image struct from a ra_tex + some metadata +static struct image image_wrap(struct ra_tex *tex, enum plane_type type, + int components) { assert(type != PLANE_NONE); - return (struct img_tex){ + return (struct image){ .type = type, - .tex = fbo->tex, + .tex = tex, .multiplier = 1.0, - .w = fbo->lw, - .h = fbo->lh, + .w = tex ? tex->params.w : 1, + .h = tex ? 
tex->params.h : 1, .transform = identity_trans, .components = components, }; } -// Bind an img_tex to a free texture unit and return its ID. At most -// TEXUNIT_VIDEO_NUM texture units can be bound at once -static int pass_bind(struct gl_video *p, struct img_tex tex) +// Bind an image to a free texture unit and return its ID. +static int pass_bind(struct gl_video *p, struct image img) { - assert(p->pass_tex_num < TEXUNIT_VIDEO_NUM); - p->pass_tex[p->pass_tex_num] = tex; - return p->pass_tex_num++; + int idx = p->num_pass_imgs; + MP_TARRAY_APPEND(p, p->pass_imgs, p->num_pass_imgs, img); + return idx; } // Rotation by 90° and flipping. @@ -678,11 +686,11 @@ static enum plane_type merge_plane_types(enum plane_type a, enum plane_type b) return a; } -// Places a video_image's image textures + associated metadata into tex[]. The +// Places a video_image's image textures + associated metadata into img[]. The // number of textures is equal to p->plane_count. Any necessary plane offsets // are stored in off. (e.g. 
chroma position) -static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, - struct img_tex tex[4], struct gl_transform off[4]) +static void pass_get_images(struct gl_video *p, struct video_image *vimg, + struct image img[4], struct gl_transform off[4]) { assert(vimg->mpi); @@ -715,7 +723,7 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, msb_valid_bits, p->ra_format.component_bits); - memset(tex, 0, 4 * sizeof(tex[0])); + memset(img, 0, 4 * sizeof(img[0])); for (int n = 0; n < p->plane_count; n++) { struct texplane *t = &vimg->planes[n]; @@ -737,7 +745,7 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, type = merge_plane_types(type, ctype); } - tex[n] = (struct img_tex){ + img[n] = (struct image){ .type = type, .tex = t->tex, .multiplier = tex_mul, @@ -746,12 +754,12 @@ static void pass_get_img_tex(struct gl_video *p, struct video_image *vimg, }; for (int i = 0; i < 4; i++) - tex[n].components += !!p->ra_format.components[n][i]; + img[n].components += !!p->ra_format.components[n][i]; get_transform(t->w, t->h, p->image_params.rotate, t->flipped, - &tex[n].transform); + &img[n].transform); if (p->image_params.rotate % 180 == 90) - MPSWAP(int, tex[n].w, tex[n].h); + MPSWAP(int, img[n].w, img[n].h); off[n] = identity_trans; @@ -804,18 +812,27 @@ static void init_video(struct gl_video *p) { p->use_integer_conversion = false; - if (p->hwdec && ra_hwdec_test_format(p->hwdec, p->image_params.imgfmt)) { - if (p->hwdec->driver->overlay_frame) { + struct ra_hwdec *hwdec = NULL; + for (int n = 0; n < p->num_hwdecs; n++) { + if (ra_hwdec_test_format(p->hwdecs[n], p->image_params.imgfmt)) { + hwdec = p->hwdecs[n]; + break; + } + } + + if (hwdec) { + if (hwdec->driver->overlay_frame) { MP_WARN(p, "Using HW-overlay mode. 
No GL filtering is performed " "on the video!\n"); + p->hwdec_overlay = hwdec; } else { - p->hwdec_mapper = ra_hwdec_mapper_create(p->hwdec, &p->image_params); + p->hwdec_mapper = ra_hwdec_mapper_create(hwdec, &p->image_params); if (!p->hwdec_mapper) MP_ERR(p, "Initializing texture for hardware decoding failed.\n"); } if (p->hwdec_mapper) p->image_params = p->hwdec_mapper->dst_params; - const char **exts = p->hwdec->glsl_extensions; + const char **exts = hwdec->glsl_extensions; for (int n = 0; exts && exts[n]; n++) gl_sc_enable_extension(p->sc, (char *)exts[n]); p->hwdec_active = true; @@ -895,20 +912,6 @@ static void init_video(struct gl_video *p) gl_video_setup_hooks(p); } -// Release any texture mappings associated with the current frame. -static void unmap_current_image(struct gl_video *p) -{ - struct video_image *vimg = &p->image; - - if (vimg->hwdec_mapped) { - assert(p->hwdec_active && p->hwdec_mapper); - ra_hwdec_mapper_unmap(p->hwdec_mapper); - memset(vimg->planes, 0, sizeof(vimg->planes)); - vimg->hwdec_mapped = false; - vimg->id = 0; // needs to be mapped again - } -} - static struct dr_buffer *gl_find_dr_buffer(struct gl_video *p, uint8_t *ptr) { for (int i = 0; i < p->num_dr_buffers; i++) { @@ -949,10 +952,18 @@ again:; static void unref_current_image(struct gl_video *p) { - unmap_current_image(p); - p->image.id = 0; + struct video_image *vimg = &p->image; + + if (vimg->hwdec_mapped) { + assert(p->hwdec_active && p->hwdec_mapper); + ra_hwdec_mapper_unmap(p->hwdec_mapper); + memset(vimg->planes, 0, sizeof(vimg->planes)); + vimg->hwdec_mapped = false; + } + + vimg->id = 0; - mp_image_unrefp(&p->image.mpi); + mp_image_unrefp(&vimg->mpi); // While we're at it, also garbage collect pending fences in here to // get it out of the way. @@ -964,8 +975,8 @@ static void unref_current_image(struct gl_video *p) // lead to flickering artifacts. 
static void unmap_overlay(struct gl_video *p) { - if (p->hwdec_active && p->hwdec->driver->overlay_frame) - p->hwdec->driver->overlay_frame(p->hwdec, NULL, NULL, NULL, true); + if (p->hwdec_overlay) + p->hwdec_overlay->driver->overlay_frame(p->hwdec_overlay, NULL, NULL, NULL, true); } static void uninit_video(struct gl_video *p) @@ -988,12 +999,13 @@ static void uninit_video(struct gl_video *p) p->real_image_params = (struct mp_image_params){0}; p->image_params = p->real_image_params; p->hwdec_active = false; + p->hwdec_overlay = NULL; ra_hwdec_mapper_free(&p->hwdec_mapper); } static void pass_record(struct gl_video *p, struct mp_pass_perf perf) { - if (!p->pass || p->pass_idx == PASS_INFO_MAX) + if (!p->pass || p->pass_idx == VO_PASS_PERF_MAX) return; struct pass_info *pass = &p->pass[p->pass_idx]; @@ -1008,7 +1020,7 @@ static void pass_record(struct gl_video *p, struct mp_pass_perf perf) PRINTF_ATTRIBUTE(2, 3) static void pass_describe(struct gl_video *p, const char *textf, ...) { - if (!p->pass || p->pass_idx == PASS_INFO_MAX) + if (!p->pass || p->pass_idx == VO_PASS_PERF_MAX) return; struct pass_info *pass = &p->pass[p->pass_idx]; @@ -1027,7 +1039,7 @@ static void pass_info_reset(struct gl_video *p, bool is_redraw) p->pass = is_redraw ? 
p->pass_redraw : p->pass_fresh; p->pass_idx = 0; - for (int i = 0; i < PASS_INFO_MAX; i++) { + for (int i = 0; i < VO_PASS_PERF_MAX; i++) { p->pass[i].desc.len = 0; p->pass[i].perf = (struct mp_pass_perf){0}; } @@ -1038,14 +1050,14 @@ static void pass_report_performance(struct gl_video *p) if (!p->pass) return; - for (int i = 0; i < PASS_INFO_MAX; i++) { + for (int i = 0; i < VO_PASS_PERF_MAX; i++) { struct pass_info *pass = &p->pass[i]; if (pass->desc.len) { - MP_DBG(p, "pass '%.*s': last %dus avg %dus peak %dus\n", - BSTR_P(pass->desc), - (int)pass->perf.last/1000, - (int)pass->perf.avg/1000, - (int)pass->perf.peak/1000); + MP_TRACE(p, "pass '%.*s': last %dus avg %dus peak %dus\n", + BSTR_P(pass->desc), + (int)pass->perf.last/1000, + (int)pass->perf.avg/1000, + (int)pass->perf.peak/1000); } } } @@ -1054,8 +1066,8 @@ static void pass_prepare_src_tex(struct gl_video *p) { struct gl_shader_cache *sc = p->sc; - for (int n = 0; n < p->pass_tex_num; n++) { - struct img_tex *s = &p->pass_tex[n]; + for (int n = 0; n < p->num_pass_imgs; n++) { + struct image *s = &p->pass_imgs[n]; if (!s->tex) continue; @@ -1079,6 +1091,11 @@ static void pass_prepare_src_tex(struct gl_video *p) } } +static void cleanup_binds(struct gl_video *p) +{ + p->num_pass_imgs = 0; +} + // Sets the appropriate compute shader metadata for an implicit compute pass // bw/bh: block size static void pass_is_compute(struct gl_video *p, int bw, int bh) @@ -1101,7 +1118,6 @@ static void dispatch_compute(struct gl_video *p, int w, int h, info.threads_h > 0 ? 
info.threads_h : info.block_h); pass_prepare_src_tex(p); - gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); // Since we don't actually have vertices, we pretend for convenience // reasons that we do and calculate the right texture coordinates based on @@ -1109,25 +1125,21 @@ static void dispatch_compute(struct gl_video *p, int w, int h, gl_sc_uniform_vec2(p->sc, "out_scale", (float[2]){ 1.0 / w, 1.0 / h }); PRELUDE("#define outcoord(id) (out_scale * (vec2(id) + vec2(0.5)))\n"); - for (int n = 0; n < TEXUNIT_VIDEO_NUM; n++) { - struct img_tex *s = &p->pass_tex[n]; + for (int n = 0; n < p->num_pass_imgs; n++) { + struct image *s = &p->pass_imgs[n]; if (!s->tex) continue; // We need to rescale the coordinates to the true texture size - char tex_scale[32]; - snprintf(tex_scale, sizeof(tex_scale), "tex_scale%d", n); + char *tex_scale = mp_tprintf(32, "tex_scale%d", n); gl_sc_uniform_vec2(p->sc, tex_scale, (float[2]){ (float)s->w / s->tex->params.w, (float)s->h / s->tex->params.h, }); - PRELUDE("#define texcoord%d_raw(id) (tex_scale%d * outcoord(id))\n", n, n); - PRELUDE("#define texcoord%d_rot(id) (texture_rot%d * texcoord%d_raw(id) + " + PRELUDE("#define texmap%d_raw(id) (tex_scale%d * outcoord(id))\n", n, n); + PRELUDE("#define texmap%d(id) (texture_rot%d * texmap%d_raw(id) + " "pixel_size%d * texture_off%d)\n", n, n, n, n, n); - // Clamp the texture coordinates to prevent sampling out-of-bounds in - // threads that exceed the requested width/height - PRELUDE("#define texmap%d(id) min(texcoord%d_rot(id), vec2(1.0))\n", n, n); PRELUDE("#define texcoord%d texmap%d(gl_GlobalInvocationID)\n", n, n); } @@ -1137,19 +1149,34 @@ static void dispatch_compute(struct gl_video *p, int w, int h, num_y = info.block_h > 0 ? 
(h + info.block_h - 1) / info.block_h : 1; pass_record(p, gl_sc_dispatch_compute(p->sc, num_x, num_y, 1)); - - memset(&p->pass_tex, 0, sizeof(p->pass_tex)); - p->pass_tex_num = 0; + cleanup_binds(p); } static struct mp_pass_perf render_pass_quad(struct gl_video *p, - struct fbodst target, + struct ra_fbo fbo, const struct mp_rect *dst) { - struct vertex va[6] = {0}; + // The first element is reserved for `vec2 position` + int num_vertex_attribs = 1 + p->num_pass_imgs; + size_t vertex_stride = num_vertex_attribs * sizeof(struct vertex_pt); + + // Expand the VAO if necessary + while (p->vao_len < num_vertex_attribs) { + MP_TARRAY_APPEND(p, p->vao, p->vao_len, (struct ra_renderpass_input) { + .name = talloc_asprintf(p, "texcoord%d", p->vao_len - 1), + .type = RA_VARTYPE_FLOAT, + .dim_v = 2, + .dim_m = 1, + .offset = p->vao_len * sizeof(struct vertex_pt), + }); + } + + int num_vertices = 6; // quad as triangle list + int num_attribs_total = num_vertices * num_vertex_attribs; + MP_TARRAY_GROW(p, p->tmp_vertex, num_attribs_total); struct gl_transform t; - gl_transform_ortho_fbodst(&t, target); + gl_transform_ortho_fbo(&t, fbo); float x[2] = {dst->x0, dst->x1}; float y[2] = {dst->y0, dst->y1}; @@ -1157,11 +1184,12 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p, gl_transform_vec(t, &x[1], &y[1]); for (int n = 0; n < 4; n++) { - struct vertex *v = &va[n]; - v->position.x = x[n / 2]; - v->position.y = y[n % 2]; - for (int i = 0; i < p->pass_tex_num; i++) { - struct img_tex *s = &p->pass_tex[i]; + struct vertex_pt *vs = &p->tmp_vertex[num_vertex_attribs * n]; + // vec2 position in idx 0 + vs[0].x = x[n / 2]; + vs[0].y = y[n % 2]; + for (int i = 0; i < p->num_pass_imgs; i++) { + struct image *s = &p->pass_imgs[i]; if (!s->tex) continue; struct gl_transform tr = s->transform; @@ -1169,43 +1197,48 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p, float ty = (n % 2) * s->h; gl_transform_vec(tr, &tx, &ty); bool rect = 
s->tex->params.non_normalized; - v->texcoord[i].x = tx / (rect ? 1 : s->tex->params.w); - v->texcoord[i].y = ty / (rect ? 1 : s->tex->params.h); + // vec2 texcoordN in idx N+1 + vs[i + 1].x = tx / (rect ? 1 : s->tex->params.w); + vs[i + 1].y = ty / (rect ? 1 : s->tex->params.h); } } - va[4] = va[2]; - va[5] = va[1]; + memmove(&p->tmp_vertex[num_vertex_attribs * 4], + &p->tmp_vertex[num_vertex_attribs * 2], + vertex_stride); + + memmove(&p->tmp_vertex[num_vertex_attribs * 5], + &p->tmp_vertex[num_vertex_attribs * 1], + vertex_stride); - return gl_sc_dispatch_draw(p->sc, target.tex, va, 6); + return gl_sc_dispatch_draw(p->sc, fbo.tex, p->vao, num_vertex_attribs, + vertex_stride, p->tmp_vertex, num_vertices); } -static void finish_pass_direct(struct gl_video *p, struct fbodst target, - const struct mp_rect *dst) +static void finish_pass_fbo(struct gl_video *p, struct ra_fbo fbo, + const struct mp_rect *dst) { pass_prepare_src_tex(p); - gl_sc_set_vertex_format(p->sc, vertex_vao, sizeof(struct vertex)); - pass_record(p, render_pass_quad(p, target, dst)); + pass_record(p, render_pass_quad(p, fbo, dst)); debug_check_gl(p, "after rendering"); - memset(&p->pass_tex, 0, sizeof(p->pass_tex)); - p->pass_tex_num = 0; + cleanup_binds(p); } // dst_fbo: this will be used for rendering; possibly reallocating the whole // FBO, if the required parameters have changed // w, h: required FBO target dimension, and also defines the target rectangle // used for rasterization -// flags: 0 or combination of FBOTEX_FUZZY_W/FBOTEX_FUZZY_H (setting the fuzzy -// flags allows the FBO to be larger than the w/h parameters) -static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, - int w, int h, int flags) +static void finish_pass_tex(struct gl_video *p, struct ra_tex **dst_tex, + int w, int h) { - fbotex_change(dst_fbo, p->ra, p->log, w, h, p->fbo_format, flags); + if (!ra_tex_resize(p->ra, p->log, dst_tex, w, h, p->fbo_format)) { + cleanup_binds(p); + gl_sc_reset(p->sc); + return; 
+ } if (p->pass_compute.active) { - if (!dst_fbo->tex) - return; - gl_sc_uniform_image2D_wo(p->sc, "out_image", dst_fbo->tex); + gl_sc_uniform_image2D_wo(p->sc, "out_image", *dst_tex); if (!p->pass_compute.directly_writes) GLSL(imageStore(out_image, ivec2(gl_GlobalInvocationID), color);) @@ -1214,11 +1247,12 @@ static void finish_pass_fbo(struct gl_video *p, struct fbotex *dst_fbo, debug_check_gl(p, "after dispatching compute shader"); } else { - finish_pass_direct(p, dst_fbo->fbo, &(struct mp_rect){0, 0, w, h}); + struct ra_fbo fbo = { .tex = *dst_tex, }; + finish_pass_fbo(p, fbo, &(struct mp_rect){0, 0, w, h}); } } -static const char *get_tex_swizzle(struct img_tex *img) +static const char *get_tex_swizzle(struct image *img) { if (!img->tex) return "rgba"; @@ -1227,7 +1261,7 @@ static const char *get_tex_swizzle(struct img_tex *img) // Copy a texture to the vec4 color, while increasing offset. Also applies // the texture multiplier to the sampled color -static void copy_img_tex(struct gl_video *p, int *offset, struct img_tex img) +static void copy_image(struct gl_video *p, int *offset, struct image img) { int count = img.components; assert(*offset + count <= 4); @@ -1261,14 +1295,14 @@ static void skip_unused(struct gl_video *p, int num_components) static void uninit_scaler(struct gl_video *p, struct scaler *scaler) { - fbotex_uninit(&scaler->sep_fbo); + ra_tex_free(p->ra, &scaler->sep_fbo); ra_tex_free(p->ra, &scaler->lut); scaler->kernel = NULL; scaler->initialized = false; } static void hook_prelude(struct gl_video *p, const char *name, int id, - struct img_tex tex) + struct image img) { GLSLHF("#define %s_raw texture%d\n", name, id); GLSLHF("#define %s_pos texcoord%d\n", name, id); @@ -1276,15 +1310,15 @@ static void hook_prelude(struct gl_video *p, const char *name, int id, GLSLHF("#define %s_rot texture_rot%d\n", name, id); GLSLHF("#define %s_pt pixel_size%d\n", name, id); GLSLHF("#define %s_map texmap%d\n", name, id); - GLSLHF("#define %s_mul %f\n", name, 
tex.multiplier); + GLSLHF("#define %s_mul %f\n", name, img.multiplier); // Set up the sampling functions GLSLHF("#define %s_tex(pos) (%s_mul * vec4(texture(%s_raw, pos)).%s)\n", - name, name, name, get_tex_swizzle(&tex)); + name, name, name, get_tex_swizzle(&img)); // Since the extra matrix multiplication impacts performance, // skip it unless the texture was actually rotated - if (gl_transform_eq(tex.transform, identity_trans)) { + if (gl_transform_eq(img.transform, identity_trans)) { GLSLHF("#define %s_texOff(off) %s_tex(%s_pos + %s_pt * vec2(off))\n", name, name, name, name); } else { @@ -1294,15 +1328,15 @@ static void hook_prelude(struct gl_video *p, const char *name, int id, } } -static bool saved_tex_find(struct gl_video *p, const char *name, - struct img_tex *out) +static bool saved_img_find(struct gl_video *p, const char *name, + struct image *out) { if (!name || !out) return false; - for (int i = 0; i < p->saved_tex_num; i++) { - if (strcmp(p->saved_tex[i].name, name) == 0) { - *out = p->saved_tex[i].tex; + for (int i = 0; i < p->num_saved_imgs; i++) { + if (strcmp(p->saved_imgs[i].name, name) == 0) { + *out = p->saved_imgs[i].img; return true; } } @@ -1310,29 +1344,28 @@ static bool saved_tex_find(struct gl_video *p, const char *name, return false; } -static void saved_tex_store(struct gl_video *p, const char *name, - struct img_tex tex) +static void saved_img_store(struct gl_video *p, const char *name, + struct image img) { assert(name); - for (int i = 0; i < p->saved_tex_num; i++) { - if (strcmp(p->saved_tex[i].name, name) == 0) { - p->saved_tex[i].tex = tex; + for (int i = 0; i < p->num_saved_imgs; i++) { + if (strcmp(p->saved_imgs[i].name, name) == 0) { + p->saved_imgs[i].img = img; return; } } - assert(p->saved_tex_num < SHADER_MAX_SAVED); - p->saved_tex[p->saved_tex_num++] = (struct saved_tex) { + MP_TARRAY_APPEND(p, p->saved_imgs, p->num_saved_imgs, (struct saved_img) { .name = name, - .tex = tex - }; + .img = img + }); } static bool 
pass_hook_setup_binds(struct gl_video *p, const char *name, - struct img_tex tex, struct tex_hook *hook) + struct image img, struct tex_hook *hook) { - for (int t = 0; t < TEXUNIT_VIDEO_NUM; t++) { + for (int t = 0; t < SHADER_MAX_BINDS; t++) { char *bind_name = (char *)hook->bind_tex[t]; if (!bind_name) @@ -1340,16 +1373,16 @@ static bool pass_hook_setup_binds(struct gl_video *p, const char *name, // This is a special name that means "currently hooked texture" if (strcmp(bind_name, "HOOKED") == 0) { - int id = pass_bind(p, tex); - hook_prelude(p, "HOOKED", id, tex); - hook_prelude(p, name, id, tex); + int id = pass_bind(p, img); + hook_prelude(p, "HOOKED", id, img); + hook_prelude(p, name, id, img); continue; } // BIND can also be used to load user-defined textures, in which // case we will directly load them as a uniform instead of // generating the hook_prelude boilerplate - for (int u = 0; u < p->user_tex_num; u++) { + for (int u = 0; u < p->num_user_textures; u++) { struct gl_user_shader_tex *utex = &p->user_textures[u]; if (bstr_equals0(utex->name, bind_name)) { gl_sc_uniform_texture(p->sc, bind_name, utex->tex); @@ -1357,16 +1390,16 @@ static bool pass_hook_setup_binds(struct gl_video *p, const char *name, } } - struct img_tex bind_tex; - if (!saved_tex_find(p, bind_name, &bind_tex)) { + struct image bind_img; + if (!saved_img_find(p, bind_name, &bind_img)) { // Clean up texture bindings and move on to the next hook - MP_DBG(p, "Skipping hook on %s due to no texture named %s.\n", - name, bind_name); - p->pass_tex_num -= t; + MP_TRACE(p, "Skipping hook on %s due to no texture named %s.\n", + name, bind_name); + p->num_pass_imgs -= t; return false; } - hook_prelude(p, bind_name, pass_bind(p, bind_tex), bind_tex); + hook_prelude(p, bind_name, pass_bind(p, bind_img), bind_img); next_bind: ; } @@ -1374,18 +1407,26 @@ next_bind: ; return true; } -// Process hooks for a plane, saving the result and returning a new img_tex -// If 'trans' is NULL, the shader is 
forbidden from transforming tex -static struct img_tex pass_hook(struct gl_video *p, const char *name, - struct img_tex tex, struct gl_transform *trans) +static struct ra_tex **next_hook_tex(struct gl_video *p) +{ + if (p->idx_hook_textures == p->num_hook_textures) + MP_TARRAY_APPEND(p, p->hook_textures, p->num_hook_textures, NULL); + + return &p->hook_textures[p->idx_hook_textures++]; +} + +// Process hooks for a plane, saving the result and returning a new image +// If 'trans' is NULL, the shader is forbidden from transforming img +static struct image pass_hook(struct gl_video *p, const char *name, + struct image img, struct gl_transform *trans) { if (!name) - return tex; + return img; - saved_tex_store(p, name, tex); + saved_img_store(p, name, img); - MP_DBG(p, "Running hooks for %s\n", name); - for (int i = 0; i < p->tex_hook_num; i++) { + MP_TRACE(p, "Running hooks for %s\n", name); + for (int i = 0; i < p->num_tex_hooks; i++) { struct tex_hook *hook = &p->tex_hooks[i]; // Figure out if this pass hooks this texture @@ -1398,34 +1439,32 @@ static struct img_tex pass_hook(struct gl_video *p, const char *name, found: // Check the hook's condition - if (hook->cond && !hook->cond(p, tex, hook->priv)) { - MP_DBG(p, "Skipping hook on %s due to condition.\n", name); + if (hook->cond && !hook->cond(p, img, hook->priv)) { + MP_TRACE(p, "Skipping hook on %s due to condition.\n", name); continue; } - if (!pass_hook_setup_binds(p, name, tex, hook)) + if (!pass_hook_setup_binds(p, name, img, hook)) continue; // Run the actual hook. This generates a series of GLSL shader // instructions sufficient for drawing the hook's output struct gl_transform hook_off = identity_trans; - hook->hook(p, tex, &hook_off, hook->priv); + hook->hook(p, img, &hook_off, hook->priv); - int comps = hook->components ? hook->components : tex.components; + int comps = hook->components ? 
hook->components : img.components; skip_unused(p, comps); // Compute the updated FBO dimensions and store the result - struct mp_rect_f sz = {0, 0, tex.w, tex.h}; + struct mp_rect_f sz = {0, 0, img.w, img.h}; gl_transform_rect(hook_off, &sz); int w = lroundf(fabs(sz.x1 - sz.x0)); int h = lroundf(fabs(sz.y1 - sz.y0)); - assert(p->hook_fbo_num < SHADER_MAX_SAVED); - struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; - finish_pass_fbo(p, fbo, w, h, 0); - + struct ra_tex **tex = next_hook_tex(p); + finish_pass_tex(p, tex, w, h); const char *store_name = hook->save_tex ? hook->save_tex : name; - struct img_tex saved_tex = img_tex_fbo(fbo, tex.type, comps); + struct image saved_img = image_wrap(*tex, img.type, comps); // If the texture we're saving overwrites the "current" texture, also // update the tex parameter so that the future loop cycles will use the @@ -1434,18 +1473,18 @@ found: if (!trans && !gl_transform_eq(hook_off, identity_trans)) { MP_ERR(p, "Hook tried changing size of unscalable texture %s!\n", name); - return tex; + return img; } - tex = saved_tex; + img = saved_img; if (trans) gl_transform_trans(hook_off, trans); } - saved_tex_store(p, store_name, saved_tex); + saved_img_store(p, store_name, saved_img); } - return tex; + return img; } // This can be used at any time in the middle of rendering to specify an @@ -1459,7 +1498,7 @@ static void pass_opt_hook_point(struct gl_video *p, const char *name, if (!name) return; - for (int i = 0; i < p->tex_hook_num; i++) { + for (int i = 0; i < p->num_tex_hooks; i++) { struct tex_hook *hook = &p->tex_hooks[i]; for (int h = 0; h < SHADER_MAX_HOOKS; h++) { @@ -1467,7 +1506,7 @@ static void pass_opt_hook_point(struct gl_video *p, const char *name, goto found; } - for (int b = 0; b < TEXUNIT_VIDEO_NUM; b++) { + for (int b = 0; b < SHADER_MAX_BINDS; b++) { if (hook->bind_tex[b] && strcmp(hook->bind_tex[b], name) == 0) goto found; } @@ -1476,14 +1515,12 @@ static void pass_opt_hook_point(struct gl_video *p, const 
char *name, // Nothing uses this texture, don't bother storing it return; -found: - assert(p->hook_fbo_num < SHADER_MAX_SAVED); - struct fbotex *fbo = &p->hook_fbos[p->hook_fbo_num++]; - finish_pass_fbo(p, fbo, p->texture_w, p->texture_h, 0); - - struct img_tex img = img_tex_fbo(fbo, PLANE_RGB, p->components); +found: ; + struct ra_tex **tex = next_hook_tex(p); + finish_pass_tex(p, tex, p->texture_w, p->texture_h); + struct image img = image_wrap(*tex, PLANE_RGB, p->components); img = pass_hook(p, name, img, tex_trans); - copy_img_tex(p, &(int){0}, img); + copy_image(p, &(int){0}, img); p->texture_w = img.w; p->texture_h = img.h; p->components = img.components; @@ -1493,7 +1530,9 @@ found: static void load_shader(struct gl_video *p, struct bstr body) { gl_sc_hadd_bstr(p->sc, body); + gl_sc_uniform_dynamic(p->sc); gl_sc_uniform_f(p->sc, "random", (double)av_lfg_get(&p->lfg) / UINT32_MAX); + gl_sc_uniform_dynamic(p->sc); gl_sc_uniform_i(p->sc, "frame", p->frames_uploaded); gl_sc_uniform_vec2(p->sc, "input_size", (float[]){(p->src_rect.x1 - p->src_rect.x0) * @@ -1631,7 +1670,7 @@ static void reinit_scaler(struct gl_video *p, struct scaler *scaler, } // Special helper for sampling from two separated stages -static void pass_sample_separated(struct gl_video *p, struct img_tex src, +static void pass_sample_separated(struct gl_video *p, struct image src, struct scaler *scaler, int w, int h) { // Separate the transformation into x and y components, per pass @@ -1650,10 +1689,10 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src, GLSLF("// first pass\n"); pass_sample_separated_gen(p->sc, scaler, 0, 1); GLSLF("color *= %f;\n", src.multiplier); - finish_pass_fbo(p, &scaler->sep_fbo, src.w, h, FBOTEX_FUZZY_H); + finish_pass_tex(p, &scaler->sep_fbo, src.w, h); // Second pass (scale only in the x dir) - src = img_tex_fbo(&scaler->sep_fbo, src.type, src.components); + src = image_wrap(scaler->sep_fbo, src.type, src.components); src.transform = t_x; 
pass_describe(p, "%s second pass", scaler->conf.kernel.name); sampler_prelude(p->sc, pass_bind(p, src)); @@ -1663,9 +1702,9 @@ static void pass_sample_separated(struct gl_video *p, struct img_tex src, // Picks either the compute shader version or the regular sampler version // depending on hardware support static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler, - struct img_tex tex, int w, int h) + struct image img, int w, int h) { - uint64_t reqs = RA_CAP_COMPUTE | RA_CAP_NESTED_ARRAY; + uint64_t reqs = RA_CAP_COMPUTE; if ((p->ra->caps & reqs) != reqs) goto fallback; @@ -1673,8 +1712,8 @@ static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler int offset = bound - 1; // padding top/left int padding = offset + bound; // total padding - float ratiox = (float)w / tex.w, - ratioy = (float)h / tex.h; + float ratiox = (float)w / img.w, + ratioy = (float)h / img.h; // For performance we want to load at least as many pixels // horizontally as there are threads in a warp (32 for nvidia), as @@ -1688,27 +1727,28 @@ static void pass_dispatch_sample_polar(struct gl_video *p, struct scaler *scaler int iw = (int)ceil(bw / ratiox) + padding + 1, ih = (int)ceil(bh / ratioy) + padding + 1; - int shmem_req = iw * ih * tex.components * sizeof(float); + int shmem_req = iw * ih * img.components * sizeof(float); if (shmem_req > p->ra->max_shmem) goto fallback; pass_is_compute(p, bw, bh); - pass_compute_polar(p->sc, scaler, tex.components, bw, bh, iw, ih); + pass_compute_polar(p->sc, scaler, img.components, bw, bh, iw, ih); return; fallback: // Fall back to regular polar shader when compute shaders are unsupported // or the kernel is too big for shmem - pass_sample_polar(p->sc, scaler, tex.components, p->ra->glsl_version); + pass_sample_polar(p->sc, scaler, img.components, + p->ra->caps & RA_CAP_GATHER); } -// Sample from img_tex, with the src rectangle given by it. +// Sample from image, with the src rectangle given by it. 
// The dst rectangle is implicit by what the caller will do next, but w and h // must still be what is going to be used (to dimension FBOs correctly). // This will write the scaled contents to the vec4 "color". // The scaler unit is initialized by this function; in order to avoid cache // thrashing, the scaler unit should usually use the same parameters. -static void pass_sample(struct gl_video *p, struct img_tex tex, +static void pass_sample(struct gl_video *p, struct image img, struct scaler *scaler, const struct scaler_config *conf, double scale_factor, int w, int h) { @@ -1723,14 +1763,14 @@ static void pass_sample(struct gl_video *p, struct img_tex tex, }; pass_describe(p, "%s=%s (%s)", scaler_opt[scaler->index], - scaler->conf.kernel.name, plane_names[tex.type]); + scaler->conf.kernel.name, plane_names[img.type]); bool is_separated = scaler->kernel && !scaler->kernel->polar; // Set up the transformation+prelude and bind the texture, for everything // other than separated scaling (which does this in the subfunction) if (!is_separated) - sampler_prelude(p->sc, pass_bind(p, tex)); + sampler_prelude(p->sc, pass_bind(p, img)); // Dispatch the scaler. They're all wildly different. const char *name = scaler->conf.kernel.name; @@ -1741,9 +1781,9 @@ static void pass_sample(struct gl_video *p, struct img_tex tex, } else if (strcmp(name, "oversample") == 0) { pass_sample_oversample(p->sc, scaler, w, h); } else if (scaler->kernel && scaler->kernel->polar) { - pass_dispatch_sample_polar(p, scaler, tex, w, h); + pass_dispatch_sample_polar(p, scaler, img, w, h); } else if (scaler->kernel) { - pass_sample_separated(p, tex, scaler, w, h); + pass_sample_separated(p, img, scaler, w, h); } else { // Should never happen abort(); @@ -1752,14 +1792,14 @@ static void pass_sample(struct gl_video *p, struct img_tex tex, // Apply any required multipliers. 
Separated scaling already does this in // its first stage if (!is_separated) - GLSLF("color *= %f;\n", tex.multiplier); + GLSLF("color *= %f;\n", img.multiplier); // Micro-optimization: Avoid scaling unneeded channels - skip_unused(p, tex.components); + skip_unused(p, img.components); } -// Returns true if two img_texs are semantically equivalent (same metadata) -static bool img_tex_equiv(struct img_tex a, struct img_tex b) +// Returns true if two images are semantically equivalent (same metadata) +static bool image_equiv(struct image a, struct image b) { return a.type == b.type && a.components == b.components && @@ -1772,27 +1812,15 @@ static bool img_tex_equiv(struct img_tex a, struct img_tex b) gl_transform_eq(a.transform, b.transform); } -static bool add_hook(struct gl_video *p, struct tex_hook hook) -{ - if (p->tex_hook_num < SHADER_MAX_PASSES) { - p->tex_hooks[p->tex_hook_num++] = hook; - return true; - } else { - MP_ERR(p, "Too many passes! Limit is %d.\n", SHADER_MAX_PASSES); - talloc_free(hook.priv); - return false; - } -} - -static void deband_hook(struct gl_video *p, struct img_tex tex, +static void deband_hook(struct gl_video *p, struct image img, struct gl_transform *trans, void *priv) { - pass_describe(p, "debanding (%s)", plane_names[tex.type]); + pass_describe(p, "debanding (%s)", plane_names[img.type]); pass_sample_deband(p->sc, p->opts.deband_opts, &p->lfg, p->image_params.color.gamma); } -static void unsharp_hook(struct gl_video *p, struct img_tex tex, +static void unsharp_hook(struct gl_video *p, struct image img, struct gl_transform *trans, void *priv) { pass_describe(p, "unsharp masking"); @@ -1801,7 +1829,7 @@ static void unsharp_hook(struct gl_video *p, struct img_tex tex, struct szexp_ctx { struct gl_video *p; - struct img_tex tex; + struct image img; }; static bool szexp_lookup(void *priv, struct bstr var, float size[2]) @@ -1825,15 +1853,15 @@ static bool szexp_lookup(void *priv, struct bstr var, float size[2]) // HOOKED is a special case 
if (bstr_equals0(var, "HOOKED")) { - size[0] = ctx->tex.w; - size[1] = ctx->tex.h; + size[0] = ctx->img.w; + size[1] = ctx->img.h; return true; } - for (int o = 0; o < p->saved_tex_num; o++) { - if (bstr_equals0(var, p->saved_tex[o].name)) { - size[0] = p->saved_tex[o].tex.w; - size[1] = p->saved_tex[o].tex.h; + for (int o = 0; o < p->num_saved_imgs; o++) { + if (bstr_equals0(var, p->saved_imgs[o].name)) { + size[0] = p->saved_imgs[o].img.w; + size[1] = p->saved_imgs[o].img.h; return true; } } @@ -1841,17 +1869,18 @@ static bool szexp_lookup(void *priv, struct bstr var, float size[2]) return false; } -static bool user_hook_cond(struct gl_video *p, struct img_tex tex, void *priv) +static bool user_hook_cond(struct gl_video *p, struct image img, void *priv) { struct gl_user_shader_hook *shader = priv; assert(shader); float res = false; - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->cond, &res); + struct szexp_ctx ctx = {p, img}; + eval_szexpr(p->log, &ctx, szexp_lookup, shader->cond, &res); return res; } -static void user_hook(struct gl_video *p, struct img_tex tex, +static void user_hook(struct gl_video *p, struct image img, struct gl_transform *trans, void *priv) { struct gl_user_shader_hook *shader = priv; @@ -1859,7 +1888,7 @@ static void user_hook(struct gl_video *p, struct img_tex tex, load_shader(p, shader->pass_body); pass_describe(p, "user shader: %.*s (%s)", BSTR_P(shader->pass_desc), - plane_names[tex.type]); + plane_names[img.type]); if (shader->compute.active) { p->pass_compute = shader->compute; @@ -1872,10 +1901,10 @@ static void user_hook(struct gl_video *p, struct img_tex tex, // to do this and display an error message than just crash OpenGL float w = 1.0, h = 1.0; - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->width, &w); - eval_szexpr(p->log, &(struct szexp_ctx){p, tex}, szexp_lookup, shader->height, &h); + eval_szexpr(p->log, &(struct szexp_ctx){p, img}, szexp_lookup, shader->width, &w); + 
eval_szexpr(p->log, &(struct szexp_ctx){p, img}, szexp_lookup, shader->height, &h); - *trans = (struct gl_transform){{{w / tex.w, 0}, {0, h / tex.h}}}; + *trans = (struct gl_transform){{{w / img.w, 0}, {0, h / img.h}}}; gl_transform_trans(shader->offset, trans); } @@ -1898,27 +1927,22 @@ static bool add_user_hook(void *priv, struct gl_user_shader_hook hook) for (int h = 0; h < SHADER_MAX_BINDS; h++) texhook.bind_tex[h] = bstrdup0(copy, hook.bind_tex[h]); - return add_hook(p, texhook); + MP_TARRAY_APPEND(p, p->tex_hooks, p->num_tex_hooks, texhook); + return true; } static bool add_user_tex(void *priv, struct gl_user_shader_tex tex) { struct gl_video *p = priv; - if (p->user_tex_num == SHADER_MAX_PASSES) { - MP_ERR(p, "Too many textures! Limit is %d.\n", SHADER_MAX_PASSES); - goto err; - } - tex.tex = ra_tex_create(p->ra, &tex.params); TA_FREEP(&tex.params.initial_data); - p->user_textures[p->user_tex_num++] = tex; - return true; + if (!tex.tex) + return false; -err: - talloc_free(tex.params.initial_data); - return false; + MP_TARRAY_APPEND(p, p->user_textures, p->num_user_textures, tex); + return true; } static void load_user_shaders(struct gl_video *p, char **shaders) @@ -1937,7 +1961,7 @@ static void gl_video_setup_hooks(struct gl_video *p) gl_video_reset_hooks(p); if (p->opts.deband) { - add_hook(p, (struct tex_hook) { + MP_TARRAY_APPEND(p, p->tex_hooks, p->num_tex_hooks, (struct tex_hook) { .hook_tex = {"LUMA", "CHROMA", "RGB", "XYZ"}, .bind_tex = {"HOOKED"}, .hook = deband_hook, @@ -1945,7 +1969,7 @@ static void gl_video_setup_hooks(struct gl_video *p) } if (p->opts.unsharp != 0.0) { - add_hook(p, (struct tex_hook) { + MP_TARRAY_APPEND(p, p->tex_hooks, p->num_tex_hooks, (struct tex_hook) { .hook_tex = {"MAIN"}, .bind_tex = {"HOOKED"}, .hook = unsharp_hook, @@ -1958,55 +1982,55 @@ static void gl_video_setup_hooks(struct gl_video *p) // sample from video textures, set "color" variable to yuv value static void pass_read_video(struct gl_video *p) { - struct img_tex 
tex[4]; + struct image img[4]; struct gl_transform offsets[4]; - pass_get_img_tex(p, &p->image, tex, offsets); + pass_get_images(p, &p->image, img, offsets); // To keep the code as simple as possibly, we currently run all shader // stages even if they would be unnecessary (e.g. no hooks for a texture). - // In the future, deferred img_tex should optimize this away. + // In the future, deferred image should optimize this away. // Merge semantically identical textures. This loop is done from back // to front so that merged textures end up in the right order while // simultaneously allowing us to skip unnecessary merges for (int n = 3; n >= 0; n--) { - if (tex[n].type == PLANE_NONE) + if (img[n].type == PLANE_NONE) continue; int first = n; int num = 0; for (int i = 0; i < n; i++) { - if (img_tex_equiv(tex[n], tex[i]) && + if (image_equiv(img[n], img[i]) && gl_transform_eq(offsets[n], offsets[i])) { GLSLF("// merging plane %d ...\n", i); - copy_img_tex(p, &num, tex[i]); + copy_image(p, &num, img[i]); first = MPMIN(first, i); - tex[i] = (struct img_tex){0}; + img[i] = (struct image){0}; } } if (num > 0) { GLSLF("// merging plane %d ... 
into %d\n", n, first); - copy_img_tex(p, &num, tex[n]); + copy_image(p, &num, img[n]); pass_describe(p, "merging planes"); - finish_pass_fbo(p, &p->merge_fbo[n], tex[n].w, tex[n].h, 0); - tex[first] = img_tex_fbo(&p->merge_fbo[n], tex[n].type, num); - tex[n] = (struct img_tex){0}; + finish_pass_tex(p, &p->merge_tex[n], img[n].w, img[n].h); + img[first] = image_wrap(p->merge_tex[n], img[n].type, num); + img[n] = (struct image){0}; } } // If any textures are still in integer format by this point, we need // to introduce an explicit conversion pass to avoid breaking hooks/scaling for (int n = 0; n < 4; n++) { - if (tex[n].tex && tex[n].tex->params.format->ctype == RA_CTYPE_UINT) { + if (img[n].tex && img[n].tex->params.format->ctype == RA_CTYPE_UINT) { GLSLF("// use_integer fix for plane %d\n", n); - copy_img_tex(p, &(int){0}, tex[n]); + copy_image(p, &(int){0}, img[n]); pass_describe(p, "use_integer fix"); - finish_pass_fbo(p, &p->integer_fbo[n], tex[n].w, tex[n].h, 0); - tex[n] = img_tex_fbo(&p->integer_fbo[n], tex[n].type, - tex[n].components); + finish_pass_tex(p, &p->integer_tex[n], img[n].w, img[n].h); + img[n] = image_wrap(p->integer_tex[n], img[n].type, + img[n].components); } } @@ -2014,7 +2038,7 @@ static void pass_read_video(struct gl_video *p) // modifying them in the process for (int n = 0; n < 4; n++) { const char *name; - switch (tex[n].type) { + switch (img[n].type) { case PLANE_RGB: name = "RGB"; break; case PLANE_LUMA: name = "LUMA"; break; case PLANE_CHROMA: name = "CHROMA"; break; @@ -2023,7 +2047,7 @@ static void pass_read_video(struct gl_video *p) default: continue; } - tex[n] = pass_hook(p, name, tex[n], &offsets[n]); + img[n] = pass_hook(p, name, img[n], &offsets[n]); } // At this point all planes are finalized but they may not be at the @@ -2032,15 +2056,15 @@ static void pass_read_video(struct gl_video *p) // the rgb/luma texture is the "reference" and scale everything else // to match. 
for (int n = 0; n < 4; n++) { - switch (tex[n].type) { + switch (img[n].type) { case PLANE_RGB: case PLANE_XYZ: case PLANE_LUMA: break; default: continue; } - p->texture_w = tex[n].w; - p->texture_h = tex[n].h; + p->texture_w = img[n].w; + p->texture_h = img[n].h; p->texture_offset = offsets[n]; break; } @@ -2049,20 +2073,16 @@ static void pass_read_video(struct gl_video *p) struct mp_rect_f src = {0.0, 0.0, p->image_params.w, p->image_params.h}; struct mp_rect_f ref = src; gl_transform_rect(p->texture_offset, &ref); - MP_DBG(p, "ref rect: {%f %f} {%f %f}\n", ref.x0, ref.y0, ref.x1, ref.y1); // Explicitly scale all of the textures that don't match for (int n = 0; n < 4; n++) { - if (tex[n].type == PLANE_NONE) + if (img[n].type == PLANE_NONE) continue; // If the planes are aligned identically, we will end up with the // exact same source rectangle. struct mp_rect_f rect = src; gl_transform_rect(offsets[n], &rect); - MP_DBG(p, "rect[%d]: {%f %f} {%f %f}\n", n, - rect.x0, rect.y0, rect.x1, rect.y1); - if (mp_rect_f_seq(ref, rect)) continue; @@ -2074,23 +2094,19 @@ static void pass_read_video(struct gl_video *p) {0.0, (ref.y1 - ref.y0) / (rect.y1 - rect.y0)}}, .t = {ref.x0, ref.y0}, }; - MP_DBG(p, "-> fix[%d] = {%f %f} + off {%f %f}\n", n, - fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); // Since the scale in texture space is different from the scale in // absolute terms, we have to scale the coefficients down to be // relative to the texture's physical dimensions and local offset struct gl_transform scale = { - .m = {{(float)tex[n].w / p->texture_w, 0.0}, - {0.0, (float)tex[n].h / p->texture_h}}, + .m = {{(float)img[n].w / p->texture_w, 0.0}, + {0.0, (float)img[n].h / p->texture_h}}, .t = {-rect.x0, -rect.y0}, }; if (p->image_params.rotate % 180 == 90) MPSWAP(double, scale.m[0][0], scale.m[1][1]); gl_transform_trans(scale, &fix); - MP_DBG(p, "-> scaled[%d] = {%f %f} + off {%f %f}\n", n, - fix.m[0][0], fix.m[1][1], fix.t[0], fix.t[1]); // Since the texture transform is 
a function of the texture coordinates // to texture space, rather than the other way around, we have to @@ -2100,11 +2116,11 @@ static void pass_read_video(struct gl_video *p) fix.m[1][1] = 1.0 / fix.m[1][1]; fix.t[0] = fix.m[0][0] * -fix.t[0]; fix.t[1] = fix.m[1][1] * -fix.t[1]; - gl_transform_trans(fix, &tex[n].transform); + gl_transform_trans(fix, &img[n].transform); int scaler_id = -1; const char *name = NULL; - switch (tex[n].type) { + switch (img[n].type) { case PLANE_RGB: case PLANE_LUMA: case PLANE_XYZ: @@ -2129,31 +2145,31 @@ static void pass_read_video(struct gl_video *p) // bilinear scaling is a free no-op thanks to GPU sampling if (strcmp(conf->kernel.name, "bilinear") != 0) { GLSLF("// upscaling plane %d\n", n); - pass_sample(p, tex[n], scaler, conf, 1.0, p->texture_w, p->texture_h); - finish_pass_fbo(p, &p->scale_fbo[n], p->texture_w, p->texture_h, 0); - tex[n] = img_tex_fbo(&p->scale_fbo[n], tex[n].type, tex[n].components); + pass_sample(p, img[n], scaler, conf, 1.0, p->texture_w, p->texture_h); + finish_pass_tex(p, &p->scale_tex[n], p->texture_w, p->texture_h); + img[n] = image_wrap(p->scale_tex[n], img[n].type, img[n].components); } // Run any post-scaling hooks - tex[n] = pass_hook(p, name, tex[n], NULL); + img[n] = pass_hook(p, name, img[n], NULL); } // All planes are of the same size and properly aligned at this point - GLSLF("// combining planes\n"); + pass_describe(p, "combining planes"); int coord = 0; for (int i = 0; i < 4; i++) { - if (tex[i].type != PLANE_NONE) - copy_img_tex(p, &coord, tex[i]); + if (img[i].type != PLANE_NONE) + copy_image(p, &coord, img[i]); } p->components = coord; } -// Utility function that simply binds an FBO and reads from it, without any +// Utility function that simply binds a texture and reads from it, without any // transformations. 
-static void pass_read_fbo(struct gl_video *p, struct fbotex *fbo) +static void pass_read_tex(struct gl_video *p, struct ra_tex *tex) { - struct img_tex tex = img_tex_fbo(fbo, PLANE_RGB, p->components); - copy_img_tex(p, &(int){0}, tex); + struct image img = image_wrap(tex, PLANE_RGB, p->components); + copy_image(p, &(int){0}, img); } // yuv conversion, and any other conversions before main up/down-scaling @@ -2335,8 +2351,8 @@ static void pass_scale_main(struct gl_video *p) compute_src_transform(p, &transform); GLSLF("// main scaling\n"); - finish_pass_fbo(p, &p->indirect_fbo, p->texture_w, p->texture_h, 0); - struct img_tex src = img_tex_fbo(&p->indirect_fbo, PLANE_RGB, p->components); + finish_pass_tex(p, &p->indirect_tex, p->texture_w, p->texture_h); + struct image src = image_wrap(p->indirect_tex, PLANE_RGB, p->components); gl_transform_trans(transform, &src.transform); pass_sample(p, src, scaler, &scaler_conf, scale_factor, vp_w, vp_h); @@ -2571,6 +2587,7 @@ static void pass_dither(struct gl_video *p) float matrix[2][2] = {{cos(r), -sin(r) }, {sin(r) * m, cos(r) * m}}; + gl_sc_uniform_dynamic(p->sc); gl_sc_uniform_mat2(p->sc, "dither_trafo", true, &matrix[0][0]); GLSL(dither_pos = dither_trafo * dither_pos;) @@ -2584,7 +2601,7 @@ static void pass_dither(struct gl_video *p) // Draws the OSD, in scene-referred colors.. If cms is true, subtitles are // instead adapted to the display's gamut. 
static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts, - struct mp_osd_res rect, struct fbodst target, bool cms) + struct mp_osd_res rect, struct ra_fbo fbo, bool cms) { mpgl_osd_generate(p->osd, rect, pts, p->image_params.stereo_out, draw_flags); @@ -2604,7 +2621,7 @@ static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts, pass_colormanage(p, csp_srgb, true); } - mpgl_osd_draw_finish(p->osd, n, p->sc, target); + mpgl_osd_draw_finish(p->osd, n, p->sc, fbo); } timer_pool_stop(p->osd_timer); @@ -2620,17 +2637,17 @@ static float chroma_realign(int size, int pixel) // Minimal rendering code path, for GLES or OpenGL 2.1 without proper FBOs. static void pass_render_frame_dumb(struct gl_video *p) { - struct img_tex tex[4]; + struct image img[4]; struct gl_transform off[4]; - pass_get_img_tex(p, &p->image, tex, off); + pass_get_images(p, &p->image, img, off); struct gl_transform transform; compute_src_transform(p, &transform); int index = 0; for (int i = 0; i < p->plane_count; i++) { - int cw = tex[i].type == PLANE_CHROMA ? p->ra_format.chroma_w : 1; - int ch = tex[i].type == PLANE_CHROMA ? p->ra_format.chroma_h : 1; + int cw = img[i].type == PLANE_CHROMA ? p->ra_format.chroma_w : 1; + int ch = img[i].type == PLANE_CHROMA ? 
p->ra_format.chroma_h : 1; if (p->image_params.rotate % 180 == 90) MPSWAP(int, cw, ch); @@ -2644,10 +2661,10 @@ static void pass_render_frame_dumb(struct gl_video *p) t.t[0] += off[i].t[0]; t.t[1] += off[i].t[1]; - gl_transform_trans(tex[i].transform, &t); - tex[i].transform = t; + gl_transform_trans(img[i].transform, &t); + img[i].transform = t; - copy_img_tex(p, &index, tex[i]); + copy_image(p, &index, img[i]); } pass_convert_yuv(p); @@ -2662,8 +2679,8 @@ static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t p->texture_h = p->image_params.h; p->texture_offset = identity_trans; p->components = 0; - p->saved_tex_num = 0; - p->hook_fbo_num = 0; + p->num_saved_imgs = 0; + p->idx_hook_textures = 0; p->use_linear = false; // try uploading the frame @@ -2693,10 +2710,10 @@ static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t .w = p->texture_w, .h = p->texture_h, .display_par = scale[1] / scale[0], // counter compensate scaling }; - finish_pass_fbo(p, &p->blend_subs_fbo, rect.w, rect.h, 0); - pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, - p->blend_subs_fbo.fbo, false); - pass_read_fbo(p, &p->blend_subs_fbo); + finish_pass_tex(p, &p->blend_subs_tex, rect.w, rect.h); + struct ra_fbo fbo = { p->blend_subs_tex }; + pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, fbo, false); + pass_read_tex(p, p->blend_subs_tex); pass_describe(p, "blend subs video"); } pass_opt_hook_point(p, "MAIN", &p->texture_offset); @@ -2723,10 +2740,10 @@ static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t pass_delinearize(p->sc, p->image_params.color.gamma); p->use_linear = false; } - finish_pass_fbo(p, &p->blend_subs_fbo, p->texture_w, p->texture_h, 0); - pass_draw_osd(p, OSD_DRAW_SUB_ONLY, vpts, rect, - p->blend_subs_fbo.fbo, false); - pass_read_fbo(p, &p->blend_subs_fbo); + finish_pass_tex(p, &p->blend_subs_tex, p->texture_w, p->texture_h); + struct ra_fbo fbo = { p->blend_subs_tex }; + pass_draw_osd(p, 
OSD_DRAW_SUB_ONLY, vpts, rect, fbo, false); + pass_read_tex(p, p->blend_subs_tex); pass_describe(p, "blend subs"); } @@ -2735,7 +2752,7 @@ static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t return true; } -static void pass_draw_to_screen(struct gl_video *p, struct fbodst fbo) +static void pass_draw_to_screen(struct gl_video *p, struct ra_fbo fbo) { if (p->dumb_mode) pass_render_frame_dumb(p); @@ -2749,15 +2766,15 @@ static void pass_draw_to_screen(struct gl_video *p, struct fbodst fbo) pass_colormanage(p, p->image_params.color, false); - // Since finish_pass_direct doesn't work with compute shaders, and neither + // Since finish_pass_fbo doesn't work with compute shaders, and neither // does the checkerboard/dither code, we may need an indirection via - // p->screen_fbo here. + // p->screen_tex here. if (p->pass_compute.active) { int o_w = p->dst_rect.x1 - p->dst_rect.x0, o_h = p->dst_rect.y1 - p->dst_rect.y0; - finish_pass_fbo(p, &p->screen_fbo, o_w, o_h, FBOTEX_FUZZY); - struct img_tex tmp = img_tex_fbo(&p->screen_fbo, PLANE_RGB, p->components); - copy_img_tex(p, &(int){0}, tmp); + finish_pass_tex(p, &p->screen_tex, o_w, o_h); + struct image tmp = image_wrap(p->screen_tex, PLANE_RGB, p->components); + copy_image(p, &(int){0}, tmp); } if (p->has_alpha){ @@ -2765,14 +2782,16 @@ static void pass_draw_to_screen(struct gl_video *p, struct fbodst fbo) // Draw checkerboard pattern to indicate transparency GLSLF("// transparency checkerboard\n"); GLSL(bvec2 tile = lessThan(fract(gl_FragCoord.xy * 1.0/32.0), vec2(0.5));) - GLSL(vec3 background = vec3(tile.x == tile.y ? 1.0 : 0.75);) - GLSL(color.rgb = mix(background, color.rgb, color.a);) + GLSL(vec3 background = vec3(tile.x == tile.y ? 
0.93 : 0.87);) + GLSL(color.rgb += background.rgb * (1.0 - color.a);) + GLSL(color.a = 1.0;) } else if (p->opts.alpha_mode == ALPHA_BLEND) { // Blend into background color (usually black) struct m_color c = p->opts.background; GLSLF("vec4 background = vec4(%f, %f, %f, %f);\n", c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0); - GLSL(color = mix(background, vec4(color.rgb, 1.0), color.a);) + GLSL(color.rgb += background.rgb * (1.0 - color.a);) + GLSL(color.a = background.a;) } } @@ -2780,11 +2799,11 @@ static void pass_draw_to_screen(struct gl_video *p, struct fbodst fbo) pass_dither(p); pass_describe(p, "output to screen"); - finish_pass_direct(p, fbo, &p->dst_rect); + finish_pass_fbo(p, fbo, &p->dst_rect); } -static bool update_fbosurface(struct gl_video *p, struct mp_image *mpi, - uint64_t id, struct fbosurface *surf) +static bool update_surface(struct gl_video *p, struct mp_image *mpi, + uint64_t id, struct surface *surf) { int vp_w = p->dst_rect.x1 - p->dst_rect.x0, vp_h = p->dst_rect.y1 - p->dst_rect.y0; @@ -2801,7 +2820,7 @@ static bool update_fbosurface(struct gl_video *p, struct mp_image *mpi, pass_linearize(p->sc, p->image_params.color.gamma); } - finish_pass_fbo(p, &surf->fbotex, vp_w, vp_h, FBOTEX_FUZZY); + finish_pass_tex(p, &surf->tex, vp_w, vp_h); surf->id = id; surf->pts = mpi->pts; return true; @@ -2809,7 +2828,7 @@ static bool update_fbosurface(struct gl_video *p, struct mp_image *mpi, // Draws an interpolate frame to fbo, based on the frame timing in t static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, - struct fbodst fbo) + struct ra_fbo fbo) { bool is_new = false; @@ -2822,8 +2841,8 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, // First of all, figure out if we have a frame available at all, and draw // it manually + reset the queue if not if (p->surfaces[p->surface_now].id == 0) { - struct fbosurface *now = &p->surfaces[p->surface_now]; - if (!update_fbosurface(p, t->current, 
t->frame_id, now)) + struct surface *now = &p->surfaces[p->surface_now]; + if (!update_surface(p, t->current, t->frame_id, now)) return; p->surface_idx = p->surface_now; is_new = true; @@ -2831,13 +2850,13 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, // Find the right frame for this instant if (t->current) { - int next = fbosurface_wrap(p->surface_now + 1); + int next = surface_wrap(p->surface_now + 1); while (p->surfaces[next].id && p->surfaces[next].id > p->surfaces[p->surface_now].id && p->surfaces[p->surface_now].id < t->frame_id) { p->surface_now = next; - next = fbosurface_wrap(next + 1); + next = surface_wrap(next + 1); } } @@ -2856,20 +2875,19 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, } else { assert(tscale->kernel && !tscale->kernel->polar); size = ceil(tscale->kernel->size); - assert(size <= TEXUNIT_VIDEO_NUM); } int radius = size/2; int surface_now = p->surface_now; - int surface_bse = fbosurface_wrap(surface_now - (radius-1)); - int surface_end = fbosurface_wrap(surface_now + radius); - assert(fbosurface_wrap(surface_bse + size-1) == surface_end); + int surface_bse = surface_wrap(surface_now - (radius-1)); + int surface_end = surface_wrap(surface_now + radius); + assert(surface_wrap(surface_bse + size-1) == surface_end); // Render new frames while there's room in the queue. Note that technically, // this should be done before the step where we find the right frame, but // it only barely matters at the very beginning of playback, and this way // makes the code much more linear. 
- int surface_dst = fbosurface_wrap(p->surface_idx + 1); + int surface_dst = surface_wrap(p->surface_idx + 1); for (int i = 0; i < t->num_frames; i++) { // Avoid overwriting data we might still need if (surface_dst == surface_bse - 1) @@ -2881,11 +2899,11 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, continue; if (f_id > p->surfaces[p->surface_idx].id) { - struct fbosurface *dst = &p->surfaces[surface_dst]; - if (!update_fbosurface(p, f, f_id, dst)) + struct surface *dst = &p->surfaces[surface_dst]; + if (!update_surface(p, f, f_id, dst)) return; p->surface_idx = surface_dst; - surface_dst = fbosurface_wrap(surface_dst + 1); + surface_dst = surface_wrap(surface_dst + 1); is_new = true; } } @@ -2897,7 +2915,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, // end of playback or start of playback. bool valid = true; for (int i = surface_bse, ii; valid && i != surface_end; i = ii) { - ii = fbosurface_wrap(i + 1); + ii = surface_wrap(i + 1); if (p->surfaces[i].id == 0 || p->surfaces[ii].id == 0) { valid = false; } else if (p->surfaces[ii].id < p->surfaces[i].id) { @@ -2915,7 +2933,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, pass_describe(p, "interpolation"); if (!valid || t->still) { // surface_now is guaranteed to be valid, so we can safely use it. 
- pass_read_fbo(p, &p->surfaces[surface_now].fbotex); + pass_read_tex(p, p->surfaces[surface_now].tex); p->is_interpolated = false; } else { double mix = t->vsync_offset / t->ideal_frame_duration; @@ -2923,7 +2941,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, // so we try to adjust by using the previous set of N frames instead // (which requires some extra checking to make sure it's valid) if (mix < 0.0) { - int prev = fbosurface_wrap(surface_bse - 1); + int prev = surface_wrap(surface_bse - 1); if (p->surfaces[prev].id != 0 && p->surfaces[prev].id < p->surfaces[surface_bse].id) { @@ -2949,20 +2967,22 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, // Blend the frames together if (oversample || linear) { + gl_sc_uniform_dynamic(p->sc); gl_sc_uniform_f(p->sc, "inter_coeff", mix); GLSL(color = mix(texture(texture0, texcoord0), texture(texture1, texcoord1), inter_coeff);) } else { + gl_sc_uniform_dynamic(p->sc); gl_sc_uniform_f(p->sc, "fcoord", mix); pass_sample_separated_gen(p->sc, tscale, 0, 0); } // Load all the required frames for (int i = 0; i < size; i++) { - struct img_tex img = - img_tex_fbo(&p->surfaces[fbosurface_wrap(surface_bse+i)].fbotex, - PLANE_RGB, p->components); + struct image img = + image_wrap(p->surfaces[surface_wrap(surface_bse+i)].tex, + PLANE_RGB, p->components); // Since the code in pass_sample_separated currently assumes // the textures are bound in-order and starting at 0, we just // assert to make sure this is the case (which it should always be) @@ -2970,8 +2990,8 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, assert(id == i); } - MP_DBG(p, "inter frame dur: %f vsync: %f, mix: %f\n", - t->ideal_frame_duration, t->vsync_interval, mix); + MP_TRACE(p, "inter frame dur: %f vsync: %f, mix: %f\n", + t->ideal_frame_duration, t->vsync_interval, mix); p->is_interpolated = true; } pass_draw_to_screen(p, fbo); @@ -2980,9 +3000,11 @@ static void 
gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t, } void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, - struct fbodst target) + struct ra_fbo fbo) { - struct mp_rect target_rc = {0, 0, target.tex->params.w, target.tex->params.h}; + gl_video_update_options(p); + + struct mp_rect target_rc = {0, 0, fbo.tex->params.w, fbo.tex->params.h}; p->broken_frame = false; @@ -2991,18 +3013,18 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, if (!has_frame || !mp_rect_equals(&p->dst_rect, &target_rc)) { struct m_color c = p->clear_color; float color[4] = {c.r / 255.0, c.g / 255.0, c.b / 255.0, c.a / 255.0}; - p->ra->fns->clear(p->ra, target.tex, color, &target_rc); + p->ra->fns->clear(p->ra, fbo.tex, color, &target_rc); } - if (p->hwdec_active && p->hwdec->driver->overlay_frame) { + if (p->hwdec_overlay) { if (has_frame) { - float *color = p->hwdec->overlay_colorkey; - p->ra->fns->clear(p->ra, target.tex, color, &p->dst_rect); + float *color = p->hwdec_overlay->overlay_colorkey; + p->ra->fns->clear(p->ra, fbo.tex, color, &p->dst_rect); } - p->hwdec->driver->overlay_frame(p->hwdec, frame->current, - &p->src_rect, &p->dst_rect, - frame->frame_id != p->image.id); + p->hwdec_overlay->driver->overlay_frame(p->hwdec_overlay, frame->current, + &p->src_rect, &p->dst_rect, + frame->frame_id != p->image.id); if (frame->current) p->osd_pts = frame->current->pts; @@ -3021,7 +3043,7 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, } if (interpolate) { - gl_video_interpolate_frame(p, frame, target); + gl_video_interpolate_frame(p, frame, fbo); } else { bool is_new = frame->frame_id != p->image.id; @@ -3029,41 +3051,42 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, if (frame->still && p->opts.blend_subs) is_new = true; - if (is_new || !p->output_fbo_valid) { - p->output_fbo_valid = false; + if (is_new || !p->output_tex_valid) { + p->output_tex_valid = false; pass_info_reset(p, 
!is_new); if (!pass_render_frame(p, frame->current, frame->frame_id)) goto done; // For the non-interpolation case, we draw to a single "cache" - // FBO to speed up subsequent re-draws (if any exist) - struct fbodst dest_fbo = target; + // texture to speed up subsequent re-draws (if any exist) + struct ra_fbo dest_fbo = fbo; if (frame->num_vsyncs > 1 && frame->display_synced && !p->dumb_mode && (p->ra->caps & RA_CAP_BLIT)) { - fbotex_change(&p->output_fbo, p->ra, p->log, - target.tex->params.w, target.tex->params.h, - p->fbo_format, FBOTEX_FUZZY); - dest_fbo = p->output_fbo.fbo; - p->output_fbo_valid = true; + bool r = ra_tex_resize(p->ra, p->log, &p->output_tex, + fbo.tex->params.w, fbo.tex->params.h, + p->fbo_format); + if (r) { + dest_fbo = (struct ra_fbo) { p->output_tex }; + p->output_tex_valid = true; + } } pass_draw_to_screen(p, dest_fbo); } - // "output fbo valid" and "output fbo needed" are equivalent - if (p->output_fbo_valid) { + // "output tex valid" and "output tex needed" are equivalent + if (p->output_tex_valid) { pass_info_reset(p, true); pass_describe(p, "redraw cached frame"); struct mp_rect src = p->dst_rect; struct mp_rect dst = src; - if (target.flip) { - dst.y0 = target.tex->params.h - src.y0; - dst.y1 = target.tex->params.h - src.y1; + if (fbo.flip) { + dst.y0 = fbo.tex->params.h - src.y0; + dst.y1 = fbo.tex->params.h - src.y1; } timer_pool_start(p->blit_timer); - p->ra->fns->blit(p->ra, target.tex, p->output_fbo.tex, - &dst, &src); + p->ra->fns->blit(p->ra, fbo.tex, p->output_tex, &dst, &src); timer_pool_stop(p->blit_timer); pass_record(p, timer_pool_measure(p->blit_timer)); } @@ -3072,8 +3095,6 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, done: - unmap_current_image(p); - debug_check_gl(p, "after video rendering"); if (p->osd) { @@ -3084,7 +3105,7 @@ done: pass_info_reset(p, true); pass_draw_osd(p, p->opts.blend_subs ? 
OSD_DRAW_OSD_ONLY : 0, - p->osd_pts, p->osd_rect, target, true); + p->osd_pts, p->osd_rect, fbo, true); debug_check_gl(p, "after OSD rendering"); } @@ -3092,17 +3113,7 @@ done: // Make the screen solid blue to make it visually clear that an // error has occurred float color[4] = {0.0, 0.05, 0.5, 1.0}; - p->ra->fns->clear(p->ra, target.tex, color, &target_rc); - } - - // The playloop calls this last before waiting some time until it decides - // to call flip_page(). Tell OpenGL to start execution of the GPU commands - // while we sleep (this happens asynchronously). - if ((p->opts.early_flush == -1 && !frame->display_synced) || - p->opts.early_flush == 1) - { - if (p->ra->fns->flush) - p->ra->fns->flush(p->ra); + p->ra->fns->clear(p->ra, fbo.tex, color, &target_rc); } p->frames_rendered++; @@ -3148,7 +3159,7 @@ void gl_video_resize(struct gl_video *p, static void frame_perf_data(struct pass_info pass[], struct mp_frame_perf *out) { - for (int i = 0; i < PASS_INFO_MAX; i++) { + for (int i = 0; i < VO_PASS_PERF_MAX; i++) { if (!pass[i].desc.len) break; out->perf[out->count] = pass[i].perf; @@ -3169,14 +3180,14 @@ static void reinterleave_vdpau(struct gl_video *p, struct ra_tex *input[4], struct ra_tex *output[2]) { for (int n = 0; n < 2; n++) { - struct fbotex *fbo = &p->vdpau_deinterleave_fbo[n]; + struct ra_tex **tex = &p->vdpau_deinterleave_tex[n]; // This is an array of the 2 to-merge planes. struct ra_tex **src = &input[n * 2]; int w = src[0]->params.w; int h = src[0]->params.h; int ids[2]; for (int t = 0; t < 2; t++) { - ids[t] = pass_bind(p, (struct img_tex){ + ids[t] = pass_bind(p, (struct image){ .tex = src[t], .multiplier = 1.0, .transform = identity_trans, @@ -3185,18 +3196,18 @@ static void reinterleave_vdpau(struct gl_video *p, }); } + pass_describe(p, "vdpau reinterleaving"); GLSLF("color = fract(gl_FragCoord.y * 0.5) < 0.5\n"); GLSLF(" ? 
texture(texture%d, texcoord%d)\n", ids[0], ids[0]); GLSLF(" : texture(texture%d, texcoord%d);", ids[1], ids[1]); - const struct ra_format *fmt = - ra_find_unorm_format(p->ra, 1, n == 0 ? 1 : 2); - fbotex_change(fbo, p->ra, p->log, w, h * 2, fmt, 0); - - pass_describe(p, "vdpau reinterleaving"); - finish_pass_direct(p, fbo->fbo, &(struct mp_rect){0, 0, w, h * 2}); + int comps = n == 0 ? 1 : 2; + const struct ra_format *fmt = ra_find_unorm_format(p->ra, 1, comps); + ra_tex_resize(p->ra, p->log, tex, w, h * 2, fmt); + struct ra_fbo fbo = { *tex }; + finish_pass_fbo(p, fbo, &(struct mp_rect){0, 0, w, h * 2}); - output[n] = fbo->tex; + output[n] = *tex; } } @@ -3262,8 +3273,6 @@ static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t for (int n = 0; n < p->plane_count; n++) { struct texplane *plane = &vimg->planes[n]; - plane->flipped = mpi->stride[0] < 0; - struct ra_tex_upload_params params = { .tex = plane->tex, .src = mpi->planes[n], @@ -3271,6 +3280,13 @@ static bool pass_upload_image(struct gl_video *p, struct mp_image *mpi, uint64_t .stride = mpi->stride[n], }; + plane->flipped = params.stride < 0; + if (plane->flipped) { + int h = mp_image_plane_h(mpi, n); + params.src = (char *)params.src + (h - 1) * params.stride; + params.stride = -params.stride; + } + struct dr_buffer *mapped = gl_find_dr_buffer(p, mpi->planes[n]); if (mapped) { params.buf = mapped->buf; @@ -3310,9 +3326,9 @@ error: static bool test_fbo(struct gl_video *p, const struct ra_format *fmt) { MP_VERBOSE(p, "Testing FBO format %s\n", fmt->name); - struct fbotex fbo = {0}; - bool success = fbotex_change(&fbo, p->ra, p->log, 16, 16, fmt, 0); - fbotex_uninit(&fbo); + struct ra_tex *tex = NULL; + bool success = ra_tex_resize(p->ra, p->log, &tex, 16, 16, fmt); + ra_tex_free(p->ra, &tex); return success; } @@ -3359,7 +3375,8 @@ static void check_gl_features(struct gl_video *p) bool have_compute = ra->caps & RA_CAP_COMPUTE; bool have_ssbo = ra->caps & RA_CAP_BUF_RW; - const char 
*auto_fbo_fmts[] = {"rgba16", "rgba16f", "rgb10_a2", "rgba8", 0}; + const char *auto_fbo_fmts[] = {"rgba16", "rgba16f", "rgba16hf", + "rgb10_a2", "rgba8", 0}; const char *user_fbo_fmts[] = {p->opts.fbo_format, 0}; const char **fbo_fmts = user_fbo_fmts[0] && strcmp(user_fbo_fmts[0], "auto") ? user_fbo_fmts : auto_fbo_fmts; @@ -3388,7 +3405,6 @@ static void check_gl_features(struct gl_video *p) "Most extended features will be disabled.\n"); } p->dumb_mode = true; - p->use_lut_3d = false; // Most things don't work, so whitelist all options that still work. p->opts = (struct gl_video_opts){ .gamma = p->opts.gamma, @@ -3409,9 +3425,13 @@ static void check_gl_features(struct gl_video *p) .tone_mapping_param = p->opts.tone_mapping_param, .tone_mapping_desat = p->opts.tone_mapping_desat, .early_flush = p->opts.early_flush, + .icc_opts = p->opts.icc_opts, + .hwdec_interop = p->opts.hwdec_interop, }; for (int n = 0; n < SCALER_COUNT; n++) p->opts.scaler[n] = gl_video_opts_def.scaler[n]; + if (!have_fbo) + p->use_lut_3d = false; return; } p->dumb_mode = false; @@ -3463,6 +3483,19 @@ static void check_gl_features(struct gl_video *p) p->opts.compute_hdr_peak = 0; MP_WARN(p, "Disabling HDR peak computation (no compute shaders).\n"); } + if (!(ra->caps & RA_CAP_FRAGCOORD) && p->opts.dither_depth >= 0 && + p->opts.dither_algo != DITHER_NONE) + { + p->opts.dither_algo = DITHER_NONE; + MP_WARN(p, "Disabling dithering (no gl_FragCoord).\n"); + } + if (!(ra->caps & RA_CAP_FRAGCOORD) && + p->opts.alpha_mode == ALPHA_BLEND_TILES) + { + p->opts.alpha_mode = ALPHA_BLEND; + // Verbose, since this is the default setting + MP_VERBOSE(p, "Disabling alpha checkerboard (no gl_FragCoord).\n"); + } } static void init_gl(struct gl_video *p) @@ -3486,6 +3519,10 @@ void gl_video_uninit(struct gl_video *p) uninit_video(p); + for (int n = 0; n < p->num_hwdecs; n++) + ra_hwdec_uninit(p->hwdecs[n]); + p->num_hwdecs = 0; + gl_sc_destroy(p->sc); ra_tex_free(p->ra, &p->lut_3d_texture); @@ -3495,7 +3532,7 
@@ void gl_video_uninit(struct gl_video *p) timer_pool_destroy(p->blit_timer); timer_pool_destroy(p->osd_timer); - for (int i = 0; i < PASS_INFO_MAX; i++) { + for (int i = 0; i < VO_PASS_PERF_MAX; i++) { talloc_free(p->pass_fresh[i].desc.start); talloc_free(p->pass_redraw[i].desc.start); } @@ -3540,8 +3577,10 @@ bool gl_video_check_format(struct gl_video *p, int mp_format) if (ra_get_imgfmt_desc(p->ra, mp_format, &desc) && is_imgfmt_desc_supported(p, &desc)) return true; - if (p->hwdec && ra_hwdec_test_format(p->hwdec, mp_format)) - return true; + for (int n = 0; n < p->num_hwdecs; n++) { + if (ra_hwdec_test_format(p->hwdecs[n], mp_format)) + return true; + } return false; } @@ -3588,6 +3627,14 @@ struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, p->opts = *opts; for (int n = 0; n < SCALER_COUNT; n++) p->scaler[n] = (struct scaler){.index = n}; + // our VAO always has the vec2 position as the first element + MP_TARRAY_APPEND(p, p->vao, p->vao_len, (struct ra_renderpass_input) { + .name = "position", + .type = RA_VARTYPE_FLOAT, + .dim_v = 2, + .dim_m = 1, + .offset = 0, + }); init_gl(p); reinit_from_options(p); return p; @@ -3612,12 +3659,15 @@ static const char *handle_scaler_opt(const char *name, bool tscale) return NULL; } -void gl_video_update_options(struct gl_video *p) +static void gl_video_update_options(struct gl_video *p) { if (m_config_cache_update(p->opts_cache)) { gl_lcms_update_options(p->cms); reinit_from_options(p); } + + if (mp_csp_equalizer_state_changed(p->video_eq)) + p->output_tex_valid = false; } static void reinit_from_options(struct gl_video *p) @@ -3648,6 +3698,8 @@ static void reinit_from_options(struct gl_video *p) void gl_video_configure_queue(struct gl_video *p, struct vo *vo) { + gl_video_update_options(p); + int queue_size = 1; // Figure out an adequate size for the interpolation queue. 
The larger @@ -3742,19 +3794,12 @@ float gl_video_scale_ambient_lux(float lmin, float lmax, void gl_video_set_ambient_lux(struct gl_video *p, int lux) { if (p->opts.gamma_auto) { - float gamma = gl_video_scale_ambient_lux(16.0, 64.0, 2.40, 1.961, lux); - MP_VERBOSE(p, "ambient light changed: %dlux (gamma: %f)\n", lux, gamma); - p->opts.gamma = MPMIN(1.0, 1.961 / gamma); + p->opts.gamma = gl_video_scale_ambient_lux(16.0, 256.0, 1.0, 1.2, lux); + MP_TRACE(p, "ambient light changed: %d lux (gamma: %f)\n", lux, + p->opts.gamma); } } -void gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec) -{ - unref_current_image(p); - ra_hwdec_mapper_free(&p->hwdec_mapper); - p->hwdec = hwdec; -} - static void *gl_video_dr_alloc_buffer(struct gl_video *p, size_t size) { struct ra_buf_params params = { @@ -3811,3 +3856,46 @@ struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h gl_video_dr_free_buffer(p, ptr); return res; } + +static void load_add_hwdec(struct gl_video *p, struct mp_hwdec_devices *devs, + const struct ra_hwdec_driver *drv, bool is_auto) +{ + struct ra_hwdec *hwdec = + ra_hwdec_load_driver(p->ra, p->log, p->global, devs, drv, is_auto); + if (hwdec) + MP_TARRAY_APPEND(p, p->hwdecs, p->num_hwdecs, hwdec); +} + +void gl_video_load_hwdecs(struct gl_video *p, struct mp_hwdec_devices *devs, + bool load_all_by_default) +{ + char *type = p->opts.hwdec_interop; + if (!type || !type[0] || strcmp(type, "auto") == 0) { + if (!load_all_by_default) + return; + type = "all"; + } + if (strcmp(type, "no") == 0) { + // do nothing, just block further loading + } else if (strcmp(type, "all") == 0) { + gl_video_load_hwdecs_all(p, devs); + } else { + for (int n = 0; ra_hwdec_drivers[n]; n++) { + const struct ra_hwdec_driver *drv = ra_hwdec_drivers[n]; + if (strcmp(type, drv->name) == 0) { + load_add_hwdec(p, devs, drv, false); + break; + } + } + } + p->hwdec_interop_loading_done = true; +} + +void gl_video_load_hwdecs_all(struct gl_video *p, struct 
mp_hwdec_devices *devs) +{ + if (!p->hwdec_interop_loading_done) { + for (int n = 0; ra_hwdec_drivers[n]; n++) + load_add_hwdec(p, devs, ra_hwdec_drivers[n], true); + p->hwdec_interop_loading_done = true; + } +} diff --git a/video/out/opengl/video.h b/video/out/gpu/video.h index d163bc8..78f8828 100644 --- a/video/out/opengl/video.h +++ b/video/out/gpu/video.h @@ -27,11 +27,6 @@ #include "shader_cache.h" #include "video/csputils.h" #include "video/out/filter_kernels.h" -#include "video/out/vo.h" - -// Assume we have this many texture units for sourcing additional passes. -// The actual texture unit assignment is dynamic. -#define TEXUNIT_VIDEO_NUM 6 struct scaler_fun { char *name; @@ -56,7 +51,7 @@ struct scaler { bool initialized; struct filter_kernel *kernel; struct ra_tex *lut; - struct fbotex sep_fbo; + struct ra_tex *sep_fbo; bool insufficient; int lut_size; @@ -144,6 +139,7 @@ struct gl_video_opts { struct mp_icc_opts *icc_opts; int early_flush; char *shader_cache_dir; + char *hwdec_interop; }; extern const struct m_sub_options gl_video_conf; @@ -155,12 +151,11 @@ struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log, struct mpv_global *g); void gl_video_uninit(struct gl_video *p); void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd); -void gl_video_update_options(struct gl_video *p); bool gl_video_check_format(struct gl_video *p, int mp_format); void gl_video_config(struct gl_video *p, struct mp_image_params *params); void gl_video_set_output_depth(struct gl_video *p, int r, int g, int b); void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame, - struct fbodst target); + struct ra_fbo fbo); void gl_video_resize(struct gl_video *p, struct mp_rect *src, struct mp_rect *dst, struct mp_osd_res *osd); @@ -182,8 +177,10 @@ struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p); void gl_video_reset(struct gl_video *p); bool gl_video_showing_interpolated_frame(struct gl_video *p); -struct ra_hwdec; -void 
gl_video_set_hwdec(struct gl_video *p, struct ra_hwdec *hwdec); +struct mp_hwdec_devices; +void gl_video_load_hwdecs(struct gl_video *p, struct mp_hwdec_devices *devs, + bool load_all_by_default); +void gl_video_load_hwdecs_all(struct gl_video *p, struct mp_hwdec_devices *devs); struct vo; void gl_video_configure_queue(struct gl_video *p, struct vo *vo); diff --git a/video/out/opengl/video_shaders.c b/video/out/gpu/video_shaders.c index 60c5ce8..3e71c31 100644 --- a/video/out/opengl/video_shaders.c +++ b/video/out/gpu/video_shaders.c @@ -97,11 +97,11 @@ void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler } // Subroutine for computing and adding an individual texel contribution -// If subtexel < 0 and offset < 0, samples directly. -// If subtexel >= 0, takes the texel from cN[subtexel] -// If offset >= 0, takes the texel from inN[rel.y+y+offset][rel.x+x+offset] +// If planar is false, samples directly +// If planar is true, takes the pixel from inX[idx] where X is the component and +// `idx` must be defined by the caller static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, - int x, int y, int subtexel, int offset, int components) + int x, int y, int components, bool planar) { double radius = scaler->kernel->f.radius * scaler->kernel->filter_scale; double radius_cutoff = scaler->kernel->radius_cutoff; @@ -130,19 +130,12 @@ static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, } GLSL(wsum += w;) - if (subtexel < 0 && offset < 0) { - GLSLF("c0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y); - GLSL(color += vec4(w) * c0;) - } else if (subtexel >= 0) { + if (planar) { for (int n = 0; n < components; n++) - GLSLF("color[%d] += w * c%d[%d];\n", n, n, subtexel); - } else if (offset >= 0) { - for (int n = 0; n <components; n++) - GLSLF("color[%d] += w * in%d[rel.y+%d][rel.x+%d];\n", n, n, - y + offset, x + offset); + GLSLF("color[%d] += w * in%d[idx];\n", n, n); } else { - // invalid usage - 
abort(); + GLSLF("in0 = texture(tex, base + pt * vec2(%d.0, %d.0));\n", x, y); + GLSL(color += vec4(w) * in0;) } if (maybe_skippable) @@ -150,7 +143,7 @@ static void polar_sample(struct gl_shader_cache *sc, struct scaler *scaler, } void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, - int components, int glsl_version) + int components, bool sup_gather) { GLSL(color = vec4(0.0);) GLSLF("{\n"); @@ -158,7 +151,8 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, GLSL(vec2 base = pos - fcoord * pt;) GLSLF("float w, d, wsum = 0.0;\n"); for (int n = 0; n < components; n++) - GLSLF("vec4 c%d;\n", n); + GLSLF("vec4 in%d;\n", n); + GLSL(int idx;) gl_sc_uniform_texture(sc, "lut", scaler->lut); @@ -173,15 +167,14 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, // exactly when all four texels are within bounds bool use_gather = sqrt(x*x + y*y) < scaler->kernel->radius_cutoff; - // textureGather is only supported in GLSL 400+ - if (glsl_version < 400) + if (!sup_gather) use_gather = false; if (use_gather) { // Gather the four surrounding texels simultaneously for (int n = 0; n < components; n++) { - GLSLF("c%d = textureGatherOffset(tex, base, ivec2(%d, %d), %d);\n", - n, x, y, n); + GLSLF("in%d = textureGatherOffset(tex, base, " + "ivec2(%d, %d), %d);\n", n, x, y, n); } // Mix in all of the points with their weights @@ -192,13 +185,14 @@ void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, static const int yo[4] = {1, 1, 0, 0}; if (x+xo[p] > bound || y+yo[p] > bound) continue; - polar_sample(sc, scaler, x+xo[p], y+yo[p], p, -1, components); + GLSLF("idx = %d;\n", p); + polar_sample(sc, scaler, x+xo[p], y+yo[p], components, true); } } else { // switch to direct sampling instead, for efficiency/compatibility for (int yy = y; yy <= bound && yy <= y+1; yy++) { for (int xx = x; xx <= bound && xx <= x+1; xx++) - polar_sample(sc, scaler, xx, yy, -1, -1, components); + polar_sample(sc, 
scaler, xx, yy, components, false); } } } @@ -223,20 +217,20 @@ void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, GLSL(vec2 fcoord = fract(pos * size - vec2(0.5));) GLSL(vec2 base = pos - pt * fcoord;) GLSL(ivec2 rel = ivec2(round((base - wbase) * size));) + GLSL(int idx;) GLSLF("float w, d, wsum = 0.0;\n"); gl_sc_uniform_texture(sc, "lut", scaler->lut); // Load all relevant texels into shmem - gl_sc_enable_extension(sc, "GL_ARB_arrays_of_arrays"); for (int c = 0; c < components; c++) - GLSLHF("shared float in%d[%d][%d];\n", c, ih, iw); + GLSLHF("shared float in%d[%d];\n", c, ih * iw); GLSL(vec4 c;) GLSLF("for (int y = int(gl_LocalInvocationID.y); y < %d; y += %d) {\n", ih, bh); GLSLF("for (int x = int(gl_LocalInvocationID.x); x < %d; x += %d) {\n", iw, bw); GLSLF("c = texture(tex, wbase + pt * vec2(x - %d, y - %d));\n", offset, offset); for (int c = 0; c < components; c++) - GLSLF("in%d[y][x] = c[%d];\n", c, c); + GLSLF("in%d[%d * y + x] = c[%d];\n", c, iw, c); GLSLF("}}\n"); GLSL(groupMemoryBarrier();) GLSL(barrier();) @@ -244,8 +238,11 @@ void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, // Dispatch the actual samples GLSLF("// scaler samples\n"); for (int y = 1-bound; y <= bound; y++) { - for (int x = 1-bound; x <= bound; x++) - polar_sample(sc, scaler, x, y, -1, offset, components); + for (int x = 1-bound; x <= bound; x++) { + GLSLF("idx = %d * rel.y + rel.x + %d;\n", iw, + iw * (y + offset) + x + offset); + polar_sample(sc, scaler, x, y, components, true); + } } GLSL(color = color / vec4(wsum);) @@ -567,18 +564,19 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak, { GLSLF("// HDR tone mapping\n"); - // Desaturate the color using a coefficient dependent on the luminance - GLSL(float luma = dot(dst_luma, color.rgb);) - if (desat > 0) { - GLSLF("float overbright = max(luma - %f, 1e-6) / max(luma, 1e-6);\n", desat); - GLSL(color.rgb = mix(color.rgb, vec3(luma), overbright);) - } - // To 
prevent discoloration due to out-of-bounds clipping, we need to make // sure to reduce the value range as far as necessary to keep the entire // signal in range, so tone map based on the brightest component. GLSL(float sig = max(max(color.r, color.g), color.b);) - GLSL(float sig_orig = sig;) + + // Desaturate the color using a coefficient dependent on the signal + if (desat > 0) { + GLSL(float luma = dot(dst_luma, color.rgb);) + GLSL(float coeff = max(sig - 0.18, 1e-6) / max(sig, 1e-6);); + GLSLF("coeff = pow(coeff, %f);\n", 10.0 / desat); + GLSL(color.rgb = mix(color.rgb, vec3(luma), coeff);) + GLSL(sig = mix(sig, luma, coeff);) // also make sure to update `sig` + } if (!ref_peak) { // For performance, we want to do as few atomic operations on global @@ -614,6 +612,7 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak, GLSLHF("const float sig_peak = %f;\n", ref_peak); } + GLSL(float sig_orig = sig;) switch (algo) { case TONE_MAPPING_CLIP: GLSLF("sig = %f * sig;\n", isnan(param) ? 1.0 : param); @@ -627,7 +626,7 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak, GLSLF("float b = (j*j - 2.0*j*sig_peak + sig_peak) / " "max(1e-6, sig_peak - 1.0);\n"); GLSLF("float scale = (b*b + 2.0*b*j + j*j) / (b-a);\n"); - GLSL(sig = mix(sig, scale * (sig + a) / (sig + b), sig > j);) + GLSL(sig = sig > j ? 
scale * (sig + a) / (sig + b) : sig;) break; case TONE_MAPPING_REINHARD: { @@ -770,6 +769,7 @@ static void prng_init(struct gl_shader_cache *sc, AVLFG *lfg) // Initialize the PRNG by hashing the position + a random uniform GLSL(vec3 _m = vec3(HOOKED_pos, random) + vec3(1.0);) GLSL(float h = permute(permute(permute(_m.x)+_m.y)+_m.z);) + gl_sc_uniform_dynamic(sc); gl_sc_uniform_f(sc, "random", (double)av_lfg_get(lfg) / UINT32_MAX); } diff --git a/video/out/opengl/video_shaders.h b/video/out/gpu/video_shaders.h index 8345e4c..2ae2ac3 100644 --- a/video/out/opengl/video_shaders.h +++ b/video/out/gpu/video_shaders.h @@ -30,7 +30,7 @@ void sampler_prelude(struct gl_shader_cache *sc, int tex_num); void pass_sample_separated_gen(struct gl_shader_cache *sc, struct scaler *scaler, int d_x, int d_y); void pass_sample_polar(struct gl_shader_cache *sc, struct scaler *scaler, - int components, int glsl_version); + int components, bool sup_gather); void pass_compute_polar(struct gl_shader_cache *sc, struct scaler *scaler, int components, int bw, int bh, int iw, int ih); void pass_sample_bicubic_fast(struct gl_shader_cache *sc); diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c index 3d03c47..fda40da 100644 --- a/video/out/opengl/common.c +++ b/video/out/opengl/common.c @@ -31,6 +31,7 @@ #include "common.h" #include "common/common.h" +#include "utils.h" // This guesses if the current GL context is a suspected software renderer. static bool is_software_gl(GL *gl) @@ -49,14 +50,6 @@ static void GLAPIENTRY dummy_glBindFramebuffer(GLenum target, GLuint framebuffer assert(framebuffer == 0); } -static bool check_ext(GL *gl, const char *name) -{ - const char *exts = gl->extensions; - char *s = strstr(exts, name); - char *e = s ? 
s + strlen(name) : NULL; - return s && (s == exts || s[-1] == ' ') && (e[0] == ' ' || !e[0]); -} - #define FN_OFFS(name) offsetof(GL, name) #define DEF_FN(name) {FN_OFFS(name), "gl" # name} @@ -383,6 +376,15 @@ static const struct gl_functions gl_functions[] = { {0}, }, }, + // This one overrides GLX_SGI_swap_control on platforms using mesa. The + // only difference is that it supports glXSwapInterval(0). + { + .extension = "GLX_MESA_swap_control", + .functions = (const struct gl_function[]) { + DEF_FN_NAME(SwapInterval, "glXSwapIntervalMESA"), + {0}, + }, + }, { .extension = "WGL_EXT_swap_control", .functions = (const struct gl_function[]) { @@ -572,8 +574,8 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n), if (ver_core) must_exist = version >= ver_core; - if (section->extension && check_ext(gl, section->extension)) - exists = true; + if (section->extension) + exists = gl_check_extension(gl->extensions, section->extension); exists |= must_exist; if (!exists) @@ -623,7 +625,7 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n), if (gl->es >= 300) gl->glsl_version = 300; } else { - gl->glsl_version = 110; + gl->glsl_version = 120; int glsl_major = 0, glsl_minor = 0; if (shader && sscanf(shader, "%d.%d", &glsl_major, &glsl_minor) == 2) gl->glsl_version = glsl_major * 100 + glsl_minor; diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h index 7b2e3ed..b9f582b 100644 --- a/video/out/opengl/common.h +++ b/video/out/opengl/common.h @@ -26,10 +26,10 @@ #include "common/msg.h" #include "misc/bstr.h" -#include "video/out/vo.h" #include "video/csputils.h" - #include "video/mp_image.h" +#include "video/out/vo.h" +#include "video/out/gpu/ra.h" #include "gl_headers.h" diff --git a/video/out/opengl/context.c b/video/out/opengl/context.c index fe454e9..cdaf632 100644 --- a/video/out/opengl/context.c +++ b/video/out/opengl/context.c @@ -1,10 +1,4 @@ /* - * common OpenGL routines - * - * copyleft (C) 2005-2010 
Reimar Döffinger <Reimar.Doeffinger@gmx.de> - * Special thanks go to the xine team and Matthias Hopf, whose video_out_opengl.c - * gave me lots of good ideas. - * * This file is part of mpv. * * mpv is free software; you can redistribute it and/or @@ -21,73 +15,10 @@ * License along with mpv. If not, see <http://www.gnu.org/licenses/>. */ -#include <stddef.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <stdbool.h> -#include <math.h> -#include <assert.h> - +#include "options/m_config.h" #include "context.h" -#include "common/common.h" -#include "options/options.h" -#include "options/m_option.h" - -extern const struct mpgl_driver mpgl_driver_x11; -extern const struct mpgl_driver mpgl_driver_x11egl; -extern const struct mpgl_driver mpgl_driver_x11_probe; -extern const struct mpgl_driver mpgl_driver_drm_egl; -extern const struct mpgl_driver mpgl_driver_drm; -extern const struct mpgl_driver mpgl_driver_cocoa; -extern const struct mpgl_driver mpgl_driver_wayland; -extern const struct mpgl_driver mpgl_driver_w32; -extern const struct mpgl_driver mpgl_driver_angle; -extern const struct mpgl_driver mpgl_driver_angle_es2; -extern const struct mpgl_driver mpgl_driver_dxinterop; -extern const struct mpgl_driver mpgl_driver_rpi; -extern const struct mpgl_driver mpgl_driver_mali; -extern const struct mpgl_driver mpgl_driver_vdpauglx; - -static const struct mpgl_driver *const backends[] = { -#if HAVE_RPI - &mpgl_driver_rpi, -#endif -#if HAVE_GL_COCOA - &mpgl_driver_cocoa, -#endif -#if HAVE_EGL_ANGLE_WIN32 - &mpgl_driver_angle, -#endif -#if HAVE_GL_WIN32 - &mpgl_driver_w32, -#endif -#if HAVE_GL_DXINTEROP - &mpgl_driver_dxinterop, -#endif -#if HAVE_GL_X11 - &mpgl_driver_x11_probe, -#endif -#if HAVE_EGL_X11 - &mpgl_driver_x11egl, -#endif -#if HAVE_GL_X11 - &mpgl_driver_x11, -#endif -#if HAVE_GL_WAYLAND - &mpgl_driver_wayland, -#endif -#if HAVE_EGL_DRM - &mpgl_driver_drm, - &mpgl_driver_drm_egl, -#endif -#if HAVE_MALI_FBDEV - &mpgl_driver_mali, -#endif 
-#if HAVE_VDPAU_GL_X11 - &mpgl_driver_vdpauglx, -#endif -}; +#include "ra_gl.h" +#include "utils.h" // 0-terminated list of desktop GL versions a backend should try to // initialize. The first entry is the most preferred version. @@ -103,140 +34,322 @@ const int mpgl_preferred_gl_versions[] = { 0 }; -int mpgl_find_backend(const char *name) +enum { + FLUSH_NO = 0, + FLUSH_YES, + FLUSH_AUTO, +}; + +enum { + GLES_AUTO = 0, + GLES_YES, + GLES_NO, +}; + +struct opengl_opts { + int use_glfinish; + int waitvsync; + int vsync_pattern[2]; + int swapinterval; + int early_flush; + int restrict_version; + int gles_mode; +}; + +#define OPT_BASE_STRUCT struct opengl_opts +const struct m_sub_options opengl_conf = { + .opts = (const struct m_option[]) { + OPT_FLAG("opengl-glfinish", use_glfinish, 0), + OPT_FLAG("opengl-waitvsync", waitvsync, 0), + OPT_INT("opengl-swapinterval", swapinterval, 0), + OPT_INTPAIR("opengl-check-pattern", vsync_pattern, 0), + OPT_INT("opengl-restrict", restrict_version, 0), + OPT_CHOICE("opengl-es", gles_mode, 0, + ({"auto", GLES_AUTO}, {"yes", GLES_YES}, {"no", GLES_NO})), + OPT_CHOICE("opengl-early-flush", early_flush, 0, + ({"no", FLUSH_NO}, {"yes", FLUSH_YES}, {"auto", FLUSH_AUTO})), + + OPT_REPLACED("opengl-debug", "gpu-debug"), + OPT_REPLACED("opengl-sw", "gpu-sw"), + OPT_REPLACED("opengl-vsync-fences", "swapchain-depth"), + OPT_REPLACED("opengl-backend", "gpu-context"), + {0}, + }, + .defaults = &(const struct opengl_opts) { + .swapinterval = 1, + }, + .size = sizeof(struct opengl_opts), +}; + +struct priv { + GL *gl; + struct mp_log *log; + struct ra_gl_ctx_params params; + struct opengl_opts *opts; + struct ra_swapchain_fns fns; + GLuint main_fb; + struct ra_tex *wrapped_fb; // corresponds to main_fb + // for debugging: + int frames_rendered; + unsigned int prev_sgi_sync_count; + // for gl_vsync_pattern + int last_pattern; + int matches, mismatches; + // for swapchain_depth simulation + GLsync *vsync_fences; + int num_vsync_fences; +}; + +bool 
ra_gl_ctx_test_version(struct ra_ctx *ctx, int version, bool es) { - if (name == NULL || strcmp(name, "auto") == 0) - return -1; - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - if (strcmp(backends[n]->name, name) == 0) - return n; + bool ret; + struct opengl_opts *opts; + void *tmp = talloc_new(NULL); + opts = mp_get_config_group(tmp, ctx->global, &opengl_conf); + + // Version too high + if (opts->restrict_version && version >= opts->restrict_version) { + ret = false; + goto done; } - return -2; -} -int mpgl_validate_backend_opt(struct mp_log *log, const struct m_option *opt, - struct bstr name, struct bstr param) -{ - if (bstr_equals0(param, "help")) { - mp_info(log, "OpenGL windowing backends:\n"); - mp_info(log, " auto (autodetect)\n"); - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) - mp_info(log, " %s\n", backends[n]->name); - return M_OPT_EXIT; + switch (opts->gles_mode) { + case GLES_YES: ret = es; goto done; + case GLES_NO: ret = !es; goto done; + case GLES_AUTO: ret = true; goto done; + default: abort(); } - char s[20]; - snprintf(s, sizeof(s), "%.*s", BSTR_P(param)); - return mpgl_find_backend(s) >= -1 ? 1 : M_OPT_INVALID; + +done: + talloc_free(tmp); + return ret; } -static void *get_native_display(void *pctx, const char *name) +static void *get_native_display(void *priv, const char *name) { - MPGLContext *ctx = pctx; - if (!ctx->native_display_type || !name) + struct priv *p = priv; + if (!p->params.native_display_type || !name) + return NULL; + if (strcmp(p->params.native_display_type, name) != 0) return NULL; - return strcmp(ctx->native_display_type, name) == 0 ? 
ctx->native_display : NULL; + + return p->params.native_display; } -static MPGLContext *init_backend(struct vo *vo, const struct mpgl_driver *driver, - bool probing, int vo_flags) +void ra_gl_ctx_uninit(struct ra_ctx *ctx) { - MPGLContext *ctx = talloc_ptrtype(NULL, ctx); - *ctx = (MPGLContext) { - .gl = talloc_zero(ctx, GL), - .vo = vo, - .global = vo->global, - .driver = driver, - .log = vo->log, + if (ctx->swapchain) { + struct priv *p = ctx->swapchain->priv; + if (ctx->ra && p->wrapped_fb) + ra_tex_free(ctx->ra, &p->wrapped_fb); + talloc_free(ctx->swapchain); + ctx->swapchain = NULL; + } + + ra_free(&ctx->ra); +} + +static const struct ra_swapchain_fns ra_gl_swapchain_fns; + +bool ra_gl_ctx_init(struct ra_ctx *ctx, GL *gl, struct ra_gl_ctx_params params) +{ + struct ra_swapchain *sw = ctx->swapchain = talloc_ptrtype(NULL, sw); + *sw = (struct ra_swapchain) { + .ctx = ctx, }; - if (probing) - vo_flags |= VOFLAG_PROBING; - bool old_probing = vo->probing; - vo->probing = probing; // hack; kill it once backends are separate - MP_VERBOSE(vo, "Initializing OpenGL backend '%s'\n", ctx->driver->name); - ctx->priv = talloc_zero_size(ctx, ctx->driver->priv_size); - if (ctx->driver->init(ctx, vo_flags) < 0) { - vo->probing = old_probing; - talloc_free(ctx); - return NULL; + + struct priv *p = sw->priv = talloc_ptrtype(sw, p); + *p = (struct priv) { + .gl = gl, + .log = ctx->log, + .params = params, + .opts = mp_get_config_group(p, ctx->global, &opengl_conf), + .fns = ra_gl_swapchain_fns, + }; + + sw->fns = &p->fns; + + const struct ra_swapchain_fns *ext = p->params.external_swapchain; + if (ext) { + if (ext->color_depth) + p->fns.color_depth = ext->color_depth; + if (ext->screenshot) + p->fns.screenshot = ext->screenshot; + if (ext->start_frame) + p->fns.start_frame = ext->start_frame; + if (ext->submit_frame) + p->fns.submit_frame = ext->submit_frame; + if (ext->swap_buffers) + p->fns.swap_buffers = ext->swap_buffers; } - vo->probing = old_probing; - if 
(!ctx->gl->version && !ctx->gl->es) - goto cleanup; + if (!gl->version && !gl->es) + return false; - if (probing && ctx->gl->es && (vo_flags & VOFLAG_NO_GLES)) { - MP_VERBOSE(ctx->vo, "Skipping GLES backend.\n"); - goto cleanup; + if (gl->mpgl_caps & MPGL_CAP_SW) { + MP_WARN(p, "Suspected software renderer or indirect context.\n"); + if (ctx->opts.probing && !ctx->opts.allow_sw) + return false; } - if (ctx->gl->mpgl_caps & MPGL_CAP_SW) { - MP_WARN(ctx->vo, "Suspected software renderer or indirect context.\n"); - if (vo->probing && !(vo_flags & VOFLAG_SW)) - goto cleanup; + gl->debug_context = ctx->opts.debug; + gl->get_native_display_ctx = p; + gl->get_native_display = get_native_display; + + if (gl->SwapInterval) { + gl->SwapInterval(p->opts->swapinterval); + } else { + MP_VERBOSE(p, "GL_*_swap_control extension missing.\n"); } - ctx->gl->debug_context = !!(vo_flags & VOFLAG_GL_DEBUG); + ctx->ra = ra_create_gl(p->gl, ctx->log); + return !!ctx->ra; +} - ctx->gl->get_native_display_ctx = ctx; - ctx->gl->get_native_display = get_native_display; +void ra_gl_ctx_resize(struct ra_swapchain *sw, int w, int h, int fbo) +{ + struct priv *p = sw->priv; + if (p->main_fb == fbo && p->wrapped_fb && p->wrapped_fb->params.w == w + && p->wrapped_fb->params.h == h) + return; - return ctx; + if (p->wrapped_fb) + ra_tex_free(sw->ctx->ra, &p->wrapped_fb); -cleanup: - mpgl_uninit(ctx); - return NULL; + p->main_fb = fbo; + p->wrapped_fb = ra_create_wrapped_fb(sw->ctx->ra, fbo, w, h); } -// Create a VO window and create a GL context on it. 
-// vo_flags: passed to the backend's create window function -MPGLContext *mpgl_init(struct vo *vo, const char *backend_name, int vo_flags) +int ra_gl_ctx_color_depth(struct ra_swapchain *sw) { - MPGLContext *ctx = NULL; - int index = mpgl_find_backend(backend_name); - if (index == -1) { - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - ctx = init_backend(vo, backends[n], true, vo_flags); - if (ctx) - break; - } - // VO forced, but no backend is ok => force the first that works at all - if (!ctx && !vo->probing) { - for (int n = 0; n < MP_ARRAY_SIZE(backends); n++) { - ctx = init_backend(vo, backends[n], false, vo_flags); - if (ctx) - break; - } - } - } else if (index >= 0) { - ctx = init_backend(vo, backends[index], false, vo_flags); - } - return ctx; + struct priv *p = sw->priv; + GL *gl = p->gl; + + if (!p->wrapped_fb) + return 0; + + if ((gl->es < 300 && !gl->version) || !(gl->mpgl_caps & MPGL_CAP_FB)) + return 0; + + gl->BindFramebuffer(GL_FRAMEBUFFER, p->main_fb); + + GLenum obj = gl->version ? GL_BACK_LEFT : GL_BACK; + if (p->main_fb) + obj = GL_COLOR_ATTACHMENT0; + + GLint depth_g = 0; + + gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj, + GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &depth_g); + + gl->BindFramebuffer(GL_FRAMEBUFFER, 0); + + return depth_g; } -int mpgl_reconfig_window(struct MPGLContext *ctx) +struct mp_image *ra_gl_ctx_screenshot(struct ra_swapchain *sw) { - return ctx->driver->reconfig(ctx); + struct priv *p = sw->priv; + + assert(p->wrapped_fb); + struct mp_image *screen = gl_read_fbo_contents(p->gl, p->main_fb, + p->wrapped_fb->params.w, + p->wrapped_fb->params.h); + + // OpenGL FB is also read in flipped order, so we need to flip when the + // rendering is *not* flipped, which in our case is whenever + // p->params.flipped is true. 
I hope that made sense + if (screen && p->params.flipped) + mp_image_vflip(screen); + + return screen; } -int mpgl_control(struct MPGLContext *ctx, int *events, int request, void *arg) +bool ra_gl_ctx_start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo) { - return ctx->driver->control(ctx, events, request, arg); + struct priv *p = sw->priv; + out_fbo->tex = p->wrapped_fb; + out_fbo->flip = !p->params.flipped; // OpenGL FBs are normally flipped + return true; } -void mpgl_start_frame(struct MPGLContext *ctx) +bool ra_gl_ctx_submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame) { - if (ctx->driver->start_frame) - ctx->driver->start_frame(ctx); + struct priv *p = sw->priv; + GL *gl = p->gl; + + if (p->opts->use_glfinish) + gl->Finish(); + + if (gl->FenceSync && !p->params.external_swapchain) { + GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + if (fence) + MP_TARRAY_APPEND(p, p->vsync_fences, p->num_vsync_fences, fence); + } + + switch (p->opts->early_flush) { + case FLUSH_AUTO: + if (frame->display_synced) + break; + // fall through + case FLUSH_YES: + gl->Flush(); + } + + return true; } -void mpgl_swap_buffers(struct MPGLContext *ctx) +static void check_pattern(struct priv *p, int item) { - ctx->driver->swap_buffers(ctx); + int expected = p->opts->vsync_pattern[p->last_pattern]; + if (item == expected) { + p->last_pattern++; + if (p->last_pattern >= 2) + p->last_pattern = 0; + p->matches++; + } else { + p->mismatches++; + MP_WARN(p, "wrong pattern, expected %d got %d (hit: %d, mis: %d)\n", + expected, item, p->matches, p->mismatches); + } } -void mpgl_uninit(MPGLContext *ctx) +void ra_gl_ctx_swap_buffers(struct ra_swapchain *sw) { - if (ctx) - ctx->driver->uninit(ctx); - talloc_free(ctx); + struct priv *p = sw->priv; + GL *gl = p->gl; + + p->params.swap_buffers(sw->ctx); + p->frames_rendered++; + + if (p->frames_rendered > 5 && !sw->ctx->opts.debug) + ra_gl_set_debug(sw->ctx->ra, false); + + if ((p->opts->waitvsync || 
p->opts->vsync_pattern[0]) + && gl->GetVideoSync) + { + unsigned int n1 = 0, n2 = 0; + gl->GetVideoSync(&n1); + if (p->opts->waitvsync) + gl->WaitVideoSync(2, (n1 + 1) % 2, &n2); + int step = n1 - p->prev_sgi_sync_count; + p->prev_sgi_sync_count = n1; + MP_DBG(p, "Flip counts: %u->%u, step=%d\n", n1, n2, step); + if (p->opts->vsync_pattern[0]) + check_pattern(p, step); + } + + while (p->num_vsync_fences >= sw->ctx->opts.swapchain_depth) { + gl->ClientWaitSync(p->vsync_fences[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9); + gl->DeleteSync(p->vsync_fences[0]); + MP_TARRAY_REMOVE_AT(p->vsync_fences, p->num_vsync_fences, 0); + } } + +static const struct ra_swapchain_fns ra_gl_swapchain_fns = { + .color_depth = ra_gl_ctx_color_depth, + .screenshot = ra_gl_ctx_screenshot, + .start_frame = ra_gl_ctx_start_frame, + .submit_frame = ra_gl_ctx_submit_frame, + .swap_buffers = ra_gl_ctx_swap_buffers, +}; diff --git a/video/out/opengl/context.h b/video/out/opengl/context.h index 229c5ef..95ed374 100644 --- a/video/out/opengl/context.h +++ b/video/out/opengl/context.h @@ -1,116 +1,56 @@ -/* - * common OpenGL routines - * - * copyleft (C) 2005-2010 Reimar Döffinger <Reimar.Doeffinger@gmx.de> - * Special thanks go to the xine team and Matthias Hopf, whose video_out_opengl.c - * gave me lots of good ideas. - * - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#ifndef MP_GL_CONTEXT_H_ -#define MP_GL_CONTEXT_H_ +#pragma once +#include "common/global.h" +#include "video/out/gpu/context.h" #include "common.h" -enum { - VOFLAG_GLES = 1 << 0, // Hint to create a GLES context - VOFLAG_NO_GLES = 1 << 1, // Hint to create a desktop GL context - VOFLAG_GL_DEBUG = 1 << 2, // Hint to request debug OpenGL context - VOFLAG_ALPHA = 1 << 3, // Hint to request alpha framebuffer - VOFLAG_SW = 1 << 4, // Hint to accept a software GL renderer - VOFLAG_PROBING = 1 << 6, // The backend is being auto-probed. - VOFLAG_GLES2 = 1 << 7, // Hint for GLESv2 (needs VOFLAG_GLES) -}; - extern const int mpgl_preferred_gl_versions[]; -struct MPGLContext; - -// A windowing backend (like X11, win32, ...), which provides OpenGL rendering. -struct mpgl_driver { - const char *name; - - // Size of the struct allocated for MPGLContext.priv - int priv_size; - - // Init the GL context and possibly the underlying VO backend. - // The created context should be compatible to GL 3.2 core profile, but - // some other GL versions are supported as well (e.g. GL 2.1 or GLES 2). - // Return 0 on success, negative value (-1) on error. - int (*init)(struct MPGLContext *ctx, int vo_flags); - - // Resize the window, or create a new window if there isn't one yet. - // Currently, there is an unfortunate interaction with ctx->vo, and - // display size etc. are determined by it. - // Return 0 on success, negative value (-1) on error. - int (*reconfig)(struct MPGLContext *ctx); - - // Called when rendering starts. The backend can map or resize the - // framebuffer, or update GL.main_fb. swap_buffers() ends the frame. - // Optional. - void (*start_frame)(struct MPGLContext *ctx); - - // Present the frame. - void (*swap_buffers)(struct MPGLContext *ctx); - - // This behaves exactly like vo_driver.control(). - int (*control)(struct MPGLContext *ctx, int *events, int request, void *arg); - - // These behave exactly like vo_driver.wakeup/wait_events. They are - // optional. 
- void (*wakeup)(struct MPGLContext *ctx); - void (*wait_events)(struct MPGLContext *ctx, int64_t until_time_us); - - // Destroy the GL context and possibly the underlying VO backend. - void (*uninit)(struct MPGLContext *ctx); -}; - -typedef struct MPGLContext { - GL *gl; - struct vo *vo; - const struct mpgl_driver *driver; - struct mpv_global *global; - struct mp_log *log; - - // For hwdec_vaegl.c. +// Returns whether or not a candidate GL version should be accepted or not +// (based on the --opengl opts). Implementations may call this before +// ra_gl_ctx_init if they wish to probe for multiple possible GL versions. +bool ra_gl_ctx_test_version(struct ra_ctx *ctx, int version, bool es); + +// These are a set of helpers for ra_ctx providers based on ra_gl. +// The init function also initializes ctx->ra and ctx->swapchain, so the user +// doesn't have to do this manually. (Similarly, the uninit function will +// clean them up) + +struct ra_gl_ctx_params { + // Set to the platform-specific function to swap buffers, like + // glXSwapBuffers, eglSwapBuffers etc. This will be called by + // ra_gl_ctx_swap_buffers. Required unless you either never call that + // function or if you override it yourself. + void (*swap_buffers)(struct ra_ctx *ctx); + + // Set to false if the implementation follows normal GL semantics, which is + // upside down. Set to true if it does *not*, i.e. if rendering is right + // side up + bool flipped; + + // If this is set to non-NULL, then the ra_gl_ctx will consider the GL + // implementation to be using an external swapchain, which disables the + // software simulation of --swapchain-depth. Any functions defined by this + // ra_swapchain_fns structs will entirely replace the equivalent ra_gl_ctx + // functions in the resulting ra_swapchain. + const struct ra_swapchain_fns *external_swapchain; + + // For hwdec_vaegl.c: const char *native_display_type; void *native_display; +}; - // Flip the rendered image vertically. 
This is useful for dxinterop. - bool flip_v; - - // framebuffer to render to (normally 0) - GLuint main_fb; - - // For free use by the mpgl_driver. - void *priv; -} MPGLContext; - -MPGLContext *mpgl_init(struct vo *vo, const char *backend_name, int vo_flags); -void mpgl_uninit(MPGLContext *ctx); -int mpgl_reconfig_window(struct MPGLContext *ctx); -int mpgl_control(struct MPGLContext *ctx, int *events, int request, void *arg); -void mpgl_start_frame(struct MPGLContext *ctx); -void mpgl_swap_buffers(struct MPGLContext *ctx); - -int mpgl_find_backend(const char *name); +void ra_gl_ctx_uninit(struct ra_ctx *ctx); +bool ra_gl_ctx_init(struct ra_ctx *ctx, GL *gl, struct ra_gl_ctx_params params); -struct m_option; -int mpgl_validate_backend_opt(struct mp_log *log, const struct m_option *opt, - struct bstr name, struct bstr param); +// Call this any time the window size or main framebuffer changes +void ra_gl_ctx_resize(struct ra_swapchain *sw, int w, int h, int fbo); -#endif +// These functions are normally set in the ra_swapchain->fns, but if an +// implementation has a need to override this fns struct with custom functions +// for whatever reason, these can be used to inherit the original behavior. +int ra_gl_ctx_color_depth(struct ra_swapchain *sw); +struct mp_image *ra_gl_ctx_screenshot(struct ra_swapchain *sw); +bool ra_gl_ctx_start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo); +bool ra_gl_ctx_submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame); +void ra_gl_ctx_swap_buffers(struct ra_swapchain *sw); diff --git a/video/out/opengl/context_android.c b/video/out/opengl/context_android.c new file mode 100644 index 0000000..a2acce2 --- /dev/null +++ b/video/out/opengl/context_android.c @@ -0,0 +1,152 @@ +/* + * This file is part of mpv. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <EGL/egl.h> +#include <EGL/eglext.h> +#include <libavcodec/jni.h> +#include <android/native_window_jni.h> + +#include "egl_helpers.h" + +#include "common/common.h" +#include "options/m_config.h" +#include "context.h" + +struct priv { + struct GL gl; + EGLDisplay egl_display; + EGLConfig egl_config; + EGLContext egl_context; + EGLSurface egl_surface; + ANativeWindow *native_window; +}; + +static void android_swap_buffers(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static void android_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + + if (p->egl_surface) { + eglMakeCurrent(p->egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, + EGL_NO_CONTEXT); + eglDestroySurface(p->egl_display, p->egl_surface); + } + if (p->egl_context) + eglDestroyContext(p->egl_display, p->egl_context); + + if (p->native_window) { + ANativeWindow_release(p->native_window); + p->native_window = NULL; + } +} + +static bool android_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + + jobject surface = (jobject)(intptr_t)ctx->vo->opts->WinID; + JavaVM *vm = (JavaVM *)av_jni_get_java_vm(NULL); + JNIEnv *env; + int ret = (*vm)->GetEnv(vm, (void**)&env, JNI_VERSION_1_6); + if (ret == JNI_EDETACHED) { + if 
((*vm)->AttachCurrentThread(vm, &env, NULL) != 0) { + MP_FATAL(ctx, "Could not attach java VM.\n"); + goto fail; + } + } + p->native_window = ANativeWindow_fromSurface(env, surface); + (*vm)->DetachCurrentThread(vm); + + p->egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY); + if (!eglInitialize(p->egl_display, NULL, NULL)) { + MP_FATAL(ctx, "EGL failed to initialize.\n"); + goto fail; + } + + EGLConfig config; + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, &config)) + goto fail; + + EGLint format; + eglGetConfigAttrib(p->egl_display, config, EGL_NATIVE_VISUAL_ID, &format); + ANativeWindow_setBuffersGeometry(p->native_window, 0, 0, format); + + p->egl_surface = eglCreateWindowSurface(p->egl_display, config, + (EGLNativeWindowType)p->native_window, NULL); + + if (p->egl_surface == EGL_NO_SURFACE) { + MP_FATAL(ctx, "Could not create EGL surface!\n"); + goto fail; + } + + if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, + p->egl_context)) { + MP_FATAL(ctx, "Failed to set context!\n"); + goto fail; + } + + mpegl_load_functions(&p->gl, ctx->log); + + struct ra_gl_ctx_params params = { + .swap_buffers = android_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto fail; + + return true; +fail: + android_uninit(ctx); + return false; +} + +static bool android_reconfig(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + int w, h; + + if (!eglQuerySurface(p->egl_display, p->egl_surface, EGL_WIDTH, &w) || + !eglQuerySurface(p->egl_display, p->egl_surface, EGL_HEIGHT, &h)) { + MP_FATAL(ctx, "Failed to get height and width!\n"); + return false; + } + + ctx->vo->dwidth = w; + ctx->vo->dheight = h; + ra_gl_ctx_resize(ctx->swapchain, w, h, 0); + return true; +} + +static int android_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + return VO_NOTIMPL; +} + +const struct ra_ctx_fns ra_ctx_android = { + .type = "opengl", + .name = "android", + .reconfig = android_reconfig, + .control = android_control, + 
.init = android_init, + .uninit = android_uninit, +}; diff --git a/video/out/opengl/context_angle.c b/video/out/opengl/context_angle.c index f249b74..986a503 100644 --- a/video/out/opengl/context_angle.c +++ b/video/out/opengl/context_angle.c @@ -24,13 +24,14 @@ #include "angle_dynamic.h" #include "egl_helpers.h" -#include "d3d11_helpers.h" +#include "video/out/gpu/d3d11_helpers.h" #include "common/common.h" #include "options/m_config.h" #include "video/out/w32_common.h" #include "osdep/windows_utils.h" #include "context.h" +#include "utils.h" #ifndef EGL_D3D_TEXTURE_ANGLE #define EGL_D3D_TEXTURE_ANGLE 0x33A3 @@ -52,8 +53,6 @@ struct angle_opts { int d3d11_warp; int d3d11_feature_level; int egl_windowing; - int swapchain_length; // Currently only works with DXGI 1.2+ - int max_frame_latency; int flip; }; @@ -77,9 +76,9 @@ const struct m_sub_options angle_conf = { ({"auto", -1}, {"no", 0}, {"yes", 1})), - OPT_INTRANGE("angle-swapchain-length", swapchain_length, 0, 2, 16), - OPT_INTRANGE("angle-max-frame-latency", max_frame_latency, 0, 1, 16), OPT_FLAG("angle-flip", flip, 0), + OPT_REPLACED("angle-max-frame-latency", "swapchain-depth"), + OPT_REMOVED("angle-swapchain-length", "controlled by --swapchain-depth"), {0} }, .defaults = &(const struct angle_opts) { @@ -87,14 +86,14 @@ const struct m_sub_options angle_conf = { .d3d11_warp = -1, .d3d11_feature_level = D3D_FEATURE_LEVEL_11_0, .egl_windowing = -1, - .swapchain_length = 6, - .max_frame_latency = 3, .flip = 1, }, .size = sizeof(struct angle_opts), }; struct priv { + GL gl; + IDXGISwapChain *dxgi_swapchain; ID3D11Device *d3d11_device; @@ -110,20 +109,21 @@ struct priv { int sc_width, sc_height; // Swap chain width and height int swapinterval; + bool flipped; struct angle_opts *opts; }; -static __thread struct MPGLContext *current_ctx; +static __thread struct ra_ctx *current_ctx; -static void update_sizes(MPGLContext *ctx) +static void update_sizes(struct ra_ctx *ctx) { struct priv *p = ctx->priv; p->sc_width = 
ctx->vo->dwidth ? ctx->vo->dwidth : 1; p->sc_height = ctx->vo->dheight ? ctx->vo->dheight : 1; } -static void d3d11_backbuffer_release(MPGLContext *ctx) +static void d3d11_backbuffer_release(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -137,7 +137,7 @@ static void d3d11_backbuffer_release(MPGLContext *ctx) SAFE_RELEASE(p->d3d11_backbuffer); } -static bool d3d11_backbuffer_get(MPGLContext *ctx) +static bool d3d11_backbuffer_get(struct ra_ctx *ctx) { struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -168,7 +168,7 @@ static bool d3d11_backbuffer_get(MPGLContext *ctx) return true; } -static void d3d11_backbuffer_resize(MPGLContext *ctx) +static void d3d11_backbuffer_resize(struct ra_ctx *ctx) { struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -197,7 +197,7 @@ static void d3d11_backbuffer_resize(MPGLContext *ctx) MP_FATAL(vo, "Couldn't get back buffer after resize\n"); } -static void d3d11_device_destroy(MPGLContext *ctx) +static void d3d11_device_destroy(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -215,7 +215,7 @@ static void d3d11_device_destroy(MPGLContext *ctx) SAFE_RELEASE(p->d3d11_device); } -static bool d3d11_device_create(MPGLContext *ctx, int flags) +static bool d3d11_device_create(struct ra_ctx *ctx) { struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -226,7 +226,7 @@ static bool d3d11_device_create(MPGLContext *ctx, int flags) .force_warp = o->d3d11_warp == 1, .max_feature_level = o->d3d11_feature_level, .min_feature_level = D3D_FEATURE_LEVEL_9_3, - .max_frame_latency = o->max_frame_latency, + .max_frame_latency = ctx->opts.swapchain_depth, }; if (!mp_d3d11_create_present_device(vo->log, &device_opts, &p->d3d11_device)) return false; @@ -262,7 +262,7 @@ static bool d3d11_device_create(MPGLContext *ctx, int flags) return true; } -static void d3d11_swapchain_surface_destroy(MPGLContext *ctx) +static void d3d11_swapchain_surface_destroy(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -277,7 +277,7 @@ static void 
d3d11_swapchain_surface_destroy(MPGLContext *ctx) ID3D11DeviceContext_Flush(p->d3d11_context); } -static bool d3d11_swapchain_surface_create(MPGLContext *ctx, int flags) +static bool d3d11_swapchain_surface_create(struct ra_ctx *ctx) { struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -292,7 +292,9 @@ static bool d3d11_swapchain_surface_create(MPGLContext *ctx, int flags) .width = p->sc_width, .height = p->sc_height, .flip = o->flip, - .length = o->swapchain_length, + // Add one frame for the backbuffer and one frame of "slack" to reduce + // contention with the window manager when acquiring the backbuffer + .length = ctx->opts.swapchain_depth + 2, .usage = DXGI_USAGE_RENDER_TARGET_OUTPUT | DXGI_USAGE_SHADER_INPUT, }; if (!mp_d3d11_create_swapchain(p->d3d11_device, vo->log, &swapchain_opts, @@ -301,8 +303,7 @@ static bool d3d11_swapchain_surface_create(MPGLContext *ctx, int flags) if (!d3d11_backbuffer_get(ctx)) goto fail; - // EGL_D3D_TEXTURE_ANGLE pbuffers are always flipped vertically - ctx->flip_v = true; + p->flipped = true; return true; fail: @@ -310,7 +311,7 @@ fail: return false; } -static void d3d9_device_destroy(MPGLContext *ctx) +static void d3d9_device_destroy(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -319,7 +320,7 @@ static void d3d9_device_destroy(MPGLContext *ctx) p->egl_display = EGL_NO_DISPLAY; } -static bool d3d9_device_create(MPGLContext *ctx, int flags) +static bool d3d9_device_create(struct ra_ctx *ctx) { struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -348,7 +349,7 @@ static bool d3d9_device_create(MPGLContext *ctx, int flags) return true; } -static void egl_window_surface_destroy(MPGLContext *ctx) +static void egl_window_surface_destroy(struct ra_ctx *ctx) { struct priv *p = ctx->priv; if (p->egl_window) { @@ -357,7 +358,7 @@ static void egl_window_surface_destroy(MPGLContext *ctx) } } -static bool egl_window_surface_create(MPGLContext *ctx, int flags) +static bool egl_window_surface_create(struct ra_ctx *ctx) { 
struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -374,7 +375,7 @@ static bool egl_window_surface_create(MPGLContext *ctx, int flags) EGL_SURFACE_ORIENTATION_ANGLE); MP_TARRAY_APPEND(NULL, window_attribs, window_attribs_len, EGL_SURFACE_ORIENTATION_INVERT_Y_ANGLE); - ctx->flip_v = true; + p->flipped = true; MP_VERBOSE(vo, "Rendering flipped.\n"); } } @@ -396,7 +397,7 @@ fail: return false; } -static void context_destroy(struct MPGLContext *ctx) +static void context_destroy(struct ra_ctx *ctx) { struct priv *p = ctx->priv; if (p->egl_context) { @@ -407,7 +408,7 @@ static void context_destroy(struct MPGLContext *ctx) p->egl_context = EGL_NO_CONTEXT; } -static bool context_init(struct MPGLContext *ctx, int flags) +static bool context_init(struct ra_ctx *ctx) { struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -421,8 +422,8 @@ static bool context_init(struct MPGLContext *ctx, int flags) if (exts) MP_DBG(vo, "EGL extensions: %s\n", exts); - if (!mpegl_create_context(p->egl_display, vo->log, flags | VOFLAG_GLES, - &p->egl_context, &p->egl_config)) + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, + &p->egl_config)) { MP_FATAL(vo, "Could not create EGL context!\n"); goto fail; @@ -434,10 +435,12 @@ fail: return false; } -static void angle_uninit(struct MPGLContext *ctx) +static void angle_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + DwmEnableMMCSS(FALSE); // Uninit the EGL surface implementation that is being used. Note: This may @@ -474,17 +477,88 @@ static int GLAPIENTRY angle_swap_interval(int interval) } } -static int angle_init(struct MPGLContext *ctx, int flags) +static void d3d11_swap_buffers(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + + // Calling Present() on a flip-sequential swap chain will silently change + // the underlying storage of the back buffer to point to the next buffer in + // the chain. This results in the RTVs for the back buffer becoming + // unbound. 
Since ANGLE doesn't know we called Present(), it will continue + // using the unbound RTVs, so we must save and restore them ourselves. + ID3D11RenderTargetView *rtvs[D3D11_SIMULTANEOUS_RENDER_TARGET_COUNT] = {0}; + ID3D11DepthStencilView *dsv = NULL; + ID3D11DeviceContext_OMGetRenderTargets(p->d3d11_context, + MP_ARRAY_SIZE(rtvs), rtvs, &dsv); + + HRESULT hr = IDXGISwapChain_Present(p->dxgi_swapchain, p->swapinterval, 0); + if (FAILED(hr)) + MP_FATAL(ctx->vo, "Couldn't present: %s\n", mp_HRESULT_to_str(hr)); + + // Restore the RTVs and release the objects + ID3D11DeviceContext_OMSetRenderTargets(p->d3d11_context, + MP_ARRAY_SIZE(rtvs), rtvs, dsv); + for (int i = 0; i < MP_ARRAY_SIZE(rtvs); i++) + SAFE_RELEASE(rtvs[i]); + SAFE_RELEASE(dsv); +} + +static void egl_swap_buffers(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + eglSwapBuffers(p->egl_display, p->egl_window); +} + +static void angle_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + if (p->dxgi_swapchain) + d3d11_swap_buffers(ctx); + else + egl_swap_buffers(ctx); +} + + +static int angle_color_depth(struct ra_swapchain *sw) +{ + // Only 8-bit output is supported at the moment + return 8; +} + +static struct mp_image *angle_screenshot(struct ra_swapchain *sw) +{ + struct priv *p = sw->ctx->priv; + if (p->dxgi_swapchain) { + struct mp_image *img = mp_d3d11_screenshot(p->dxgi_swapchain); + if (img) + return img; + } + return ra_gl_ctx_screenshot(sw); +} + +static bool angle_submit_frame(struct ra_swapchain *sw, + const struct vo_frame *frame) +{ + struct priv *p = sw->ctx->priv; + bool ret = ra_gl_ctx_submit_frame(sw, frame); + if (p->d3d11_context) { + // DXGI Present doesn't flush the immediate context, which can make + // timers inaccurate, since the end queries might not be sent until the + // next frame. Fix this by flushing the context now. 
+ ID3D11DeviceContext_Flush(p->d3d11_context); + } + return ret; +} + +static bool angle_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); struct vo *vo = ctx->vo; + GL *gl = &p->gl; p->opts = mp_get_config_group(ctx, ctx->global, &angle_conf); struct angle_opts *o = p->opts; - // DWM MMCSS cargo-cult. The dxinterop backend also does this. - DwmEnableMMCSS(TRUE); - if (!angle_load()) { MP_VERBOSE(vo, "Failed to load LIBEGL.DLL\n"); goto fail; @@ -493,19 +567,19 @@ static int angle_init(struct MPGLContext *ctx, int flags) // Create the underlying EGL device implementation bool context_ok = false; if ((!context_ok && !o->renderer) || o->renderer == RENDERER_D3D11) { - context_ok = d3d11_device_create(ctx, flags); + context_ok = d3d11_device_create(ctx); if (context_ok) { - context_ok = context_init(ctx, flags); + context_ok = context_init(ctx); if (!context_ok) d3d11_device_destroy(ctx); } } if ((!context_ok && !o->renderer) || o->renderer == RENDERER_D3D9) { - context_ok = d3d9_device_create(ctx, flags); + context_ok = d3d9_device_create(ctx); if (context_ok) { MP_VERBOSE(vo, "Using Direct3D 9\n"); - context_ok = context_init(ctx, flags); + context_ok = context_init(ctx); if (!context_ok) d3d9_device_destroy(ctx); } @@ -519,181 +593,74 @@ static int angle_init(struct MPGLContext *ctx, int flags) // Create the underlying EGL surface implementation bool surface_ok = false; if ((!surface_ok && o->egl_windowing == -1) || o->egl_windowing == 0) { - surface_ok = d3d11_swapchain_surface_create(ctx, flags); + surface_ok = d3d11_swapchain_surface_create(ctx); } if ((!surface_ok && o->egl_windowing == -1) || o->egl_windowing == 1) { - surface_ok = egl_window_surface_create(ctx, flags); + surface_ok = egl_window_surface_create(ctx); if (surface_ok) MP_VERBOSE(vo, "Using EGL windowing\n"); } if (!surface_ok) goto fail; - mpegl_load_functions(ctx->gl, vo->log); + mpegl_load_functions(gl, vo->log); current_ctx = ctx; - ctx->gl->SwapInterval 
= angle_swap_interval; - - return 0; -fail: - angle_uninit(ctx); - return -1; -} - -static int angle_reconfig(struct MPGLContext *ctx) -{ - vo_w32_config(ctx->vo); - return 0; -} + gl->SwapInterval = angle_swap_interval; -static struct mp_image *d3d11_screenshot(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - ID3D11Texture2D *frontbuffer = NULL; - ID3D11Texture2D *staging = NULL; - struct mp_image *img = NULL; - HRESULT hr; - - if (!p->dxgi_swapchain) - goto done; - - // Validate the swap chain. This screenshot method will only work on DXGI - // 1.2+ flip/sequential swap chains. It's probably not possible at all with - // discard swap chains, since by definition, the backbuffer contents is - // discarded on Present(). - DXGI_SWAP_CHAIN_DESC scd; - hr = IDXGISwapChain_GetDesc(p->dxgi_swapchain, &scd); - if (FAILED(hr)) - goto done; - if (scd.SwapEffect != DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL) - goto done; - - // Get the last buffer that was presented with Present(). This should be - // the n-1th buffer for a swap chain of length n. 
- hr = IDXGISwapChain_GetBuffer(p->dxgi_swapchain, scd.BufferCount - 1, - &IID_ID3D11Texture2D, (void**)&frontbuffer); - if (FAILED(hr)) - goto done; - - D3D11_TEXTURE2D_DESC td; - ID3D11Texture2D_GetDesc(frontbuffer, &td); - if (td.SampleDesc.Count > 1) - goto done; - - // Validate the backbuffer format and convert to an mpv IMGFMT - enum mp_imgfmt fmt; - switch (td.Format) { - case DXGI_FORMAT_B8G8R8A8_UNORM: fmt = IMGFMT_BGR0; break; - case DXGI_FORMAT_R8G8B8A8_UNORM: fmt = IMGFMT_RGB0; break; - default: - goto done; - } - - // Create a staging texture based on the frontbuffer with CPU access - td.BindFlags = 0; - td.MiscFlags = 0; - td.CPUAccessFlags = D3D11_CPU_ACCESS_READ; - td.Usage = D3D11_USAGE_STAGING; - hr = ID3D11Device_CreateTexture2D(p->d3d11_device, &td, 0, &staging); - if (FAILED(hr)) - goto done; - - ID3D11DeviceContext_CopyResource(p->d3d11_context, - (ID3D11Resource*)staging, (ID3D11Resource*)frontbuffer); - - // Attempt to map the staging texture to CPU-accessible memory - D3D11_MAPPED_SUBRESOURCE lock; - hr = ID3D11DeviceContext_Map(p->d3d11_context, (ID3D11Resource*)staging, - 0, D3D11_MAP_READ, 0, &lock); - if (FAILED(hr)) - goto done; - - img = mp_image_alloc(fmt, td.Width, td.Height); - if (!img) - return NULL; - for (int i = 0; i < td.Height; i++) { - memcpy(img->planes[0] + img->stride[0] * i, - (char*)lock.pData + lock.RowPitch * i, td.Width * 4); - } - - ID3D11DeviceContext_Unmap(p->d3d11_context, (ID3D11Resource*)staging, 0); - -done: - SAFE_RELEASE(frontbuffer); - SAFE_RELEASE(staging); - return img; -} + // Custom swapchain impl for the D3D11 swapchain-based surface + static const struct ra_swapchain_fns dxgi_swapchain_fns = { + .color_depth = angle_color_depth, + .screenshot = angle_screenshot, + .submit_frame = angle_submit_frame, + }; + struct ra_gl_ctx_params params = { + .swap_buffers = angle_swap_buffers, + .flipped = p->flipped, + .external_swapchain = p->dxgi_swapchain ? 
&dxgi_swapchain_fns : NULL, + }; -static int angle_control(MPGLContext *ctx, int *events, int request, void *arg) -{ - struct priv *p = ctx->priv; + if (!ra_gl_ctx_init(ctx, gl, params)) + goto fail; - // Try a D3D11-specific method of taking a window screenshot - if (request == VOCTRL_SCREENSHOT_WIN) { - struct mp_image *img = d3d11_screenshot(ctx); - if (img) { - *(struct mp_image **)arg = img; - return true; - } - } + DwmEnableMMCSS(TRUE); // DWM MMCSS cargo-cult. The dxgl backend also does this. - int r = vo_w32_control(ctx->vo, events, request, arg); - if (*events & VO_EVENT_RESIZE) { - if (p->dxgi_swapchain) - d3d11_backbuffer_resize(ctx); - else - eglWaitClient(); // Should get ANGLE to resize its swapchain - } - return r; + return true; +fail: + angle_uninit(ctx); + return false; } -static void d3d11_swap_buffers(MPGLContext *ctx) +static void resize(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - - // Calling Present() on a flip-sequential swap chain will silently change - // the underlying storage of the back buffer to point to the next buffer in - // the chain. This results in the RTVs for the back buffer becoming - // unbound. Since ANGLE doesn't know we called Present(), it will continue - // using the unbound RTVs, so we must save and restore them ourselves. 
- ID3D11RenderTargetView *rtvs[D3D11_SIMULTANEOUS_RENDER_TARGET_COUNT] = {0}; - ID3D11DepthStencilView *dsv = NULL; - ID3D11DeviceContext_OMGetRenderTargets(p->d3d11_context, - MP_ARRAY_SIZE(rtvs), rtvs, &dsv); - - HRESULT hr = IDXGISwapChain_Present(p->dxgi_swapchain, p->swapinterval, 0); - if (FAILED(hr)) - MP_FATAL(ctx->vo, "Couldn't present: %s\n", mp_HRESULT_to_str(hr)); - - // Restore the RTVs and release the objects - ID3D11DeviceContext_OMSetRenderTargets(p->d3d11_context, - MP_ARRAY_SIZE(rtvs), rtvs, dsv); - for (int i = 0; i < MP_ARRAY_SIZE(rtvs); i++) - SAFE_RELEASE(rtvs[i]); - SAFE_RELEASE(dsv); + if (p->dxgi_swapchain) + d3d11_backbuffer_resize(ctx); + else + eglWaitClient(); // Should get ANGLE to resize its swapchain + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); } -static void egl_swap_buffers(MPGLContext *ctx) +static bool angle_reconfig(struct ra_ctx *ctx) { - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_window); + vo_w32_config(ctx->vo); + resize(ctx); + return true; } -static void angle_swap_buffers(MPGLContext *ctx) +static int angle_control(struct ra_ctx *ctx, int *events, int request, void *arg) { - struct priv *p = ctx->priv; - if (p->dxgi_swapchain) - d3d11_swap_buffers(ctx); - else - egl_swap_buffers(ctx); + int ret = vo_w32_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; } -const struct mpgl_driver mpgl_driver_angle = { +const struct ra_ctx_fns ra_ctx_angle = { + .type = "opengl", .name = "angle", - .priv_size = sizeof(struct priv), .init = angle_init, .reconfig = angle_reconfig, - .swap_buffers = angle_swap_buffers, .control = angle_control, .uninit = angle_uninit, }; diff --git a/video/out/opengl/context_cocoa.c b/video/out/opengl/context_cocoa.c index 1d9a10c..2256d31 100644 --- a/video/out/opengl/context_cocoa.c +++ b/video/out/opengl/context_cocoa.c @@ -36,6 +36,7 @@ const struct m_sub_options cocoa_conf = { }; struct priv { + GL 
gl; CGLPixelFormatObj pix; CGLContextObj ctx; @@ -62,7 +63,7 @@ static void *cocoa_glgetaddr(const char *s) return ret; } -static CGLError test_gl_version(struct MPGLContext *ctx, CGLOpenGLProfile ver) +static CGLError test_gl_version(struct ra_ctx *ctx, CGLOpenGLProfile ver) { struct priv *p = ctx->priv; @@ -107,9 +108,10 @@ error_out: return err; } -static bool create_gl_context(struct MPGLContext *ctx, int vo_flags) +static bool create_gl_context(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + GL *gl = &p->gl; CGLError err; CGLOpenGLProfile gl_versions[] = { @@ -132,60 +134,83 @@ static bool create_gl_context(struct MPGLContext *ctx, int vo_flags) vo_cocoa_set_opengl_ctx(ctx->vo, p->ctx); CGLSetCurrentContext(p->ctx); - if (vo_flags & VOFLAG_ALPHA) + if (ctx->opts.want_alpha) CGLSetParameter(p->ctx, kCGLCPSurfaceOpacity, &(GLint){0}); - mpgl_load_functions(ctx->gl, (void *)cocoa_glgetaddr, NULL, ctx->vo->log); + mpgl_load_functions(gl, (void *)cocoa_glgetaddr, NULL, ctx->vo->log); + gl->SwapInterval = set_swap_interval; CGLReleasePixelFormat(p->pix); return true; } -static void cocoa_uninit(MPGLContext *ctx) +static void cocoa_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); CGLReleaseContext(p->ctx); vo_cocoa_uninit(ctx->vo); } -static int cocoa_init(MPGLContext *ctx, int vo_flags) +static void cocoa_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + GL *gl = &p->gl; + vo_cocoa_swap_buffers(ctx->vo); + gl->Flush(); +} + +static bool cocoa_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + GL *gl = &p->gl; p->opts = mp_get_config_group(ctx, ctx->global, &cocoa_conf); vo_cocoa_init(ctx->vo); - if (!create_gl_context(ctx, vo_flags)) - return -1; + if (!create_gl_context(ctx)) + goto fail; + + struct ra_gl_ctx_params params = { + .swap_buffers = cocoa_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, gl, params)) + goto fail; + + return true; - ctx->gl->SwapInterval = 
set_swap_interval; - return 0; +fail: + cocoa_uninit(ctx); + return false; } -static int cocoa_reconfig(struct MPGLContext *ctx) +static void resize(struct ra_ctx *ctx) { - vo_cocoa_config_window(ctx->vo); - return 0; + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); } -static int cocoa_control(struct MPGLContext *ctx, int *events, int request, - void *arg) +static bool cocoa_reconfig(struct ra_ctx *ctx) { - return vo_cocoa_control(ctx->vo, events, request, arg); + vo_cocoa_config_window(ctx->vo); + resize(ctx); + return true; } -static void cocoa_swap_buffers(struct MPGLContext *ctx) +static int cocoa_control(struct ra_ctx *ctx, int *events, int request, + void *arg) { - vo_cocoa_swap_buffers(ctx->vo); - ctx->gl->Flush(); + int ret = vo_cocoa_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; } -const struct mpgl_driver mpgl_driver_cocoa = { +const struct ra_ctx_fns ra_ctx_cocoa = { + .type = "opengl", .name = "cocoa", - .priv_size = sizeof(struct priv), .init = cocoa_init, .reconfig = cocoa_reconfig, - .swap_buffers = cocoa_swap_buffers, .control = cocoa_control, .uninit = cocoa_uninit, -};
\ No newline at end of file +}; diff --git a/video/out/opengl/context_drm_egl.c b/video/out/opengl/context_drm_egl.c index e52fec4..6191309 100644 --- a/video/out/opengl/context_drm_egl.c +++ b/video/out/opengl/context_drm_egl.c @@ -25,22 +25,25 @@ #include <unistd.h> #include <gbm.h> +#include <drm_fourcc.h> #include <EGL/egl.h> #include <EGL/eglext.h> -#include "context.h" -#include "egl_helpers.h" -#include "common/common.h" +#include "libmpv/opengl_cb.h" #include "video/out/drm_common.h" +#include "common/common.h" + +#include "egl_helpers.h" +#include "common.h" +#include "context.h" #define USE_MASTER 0 struct framebuffer { - struct gbm_bo *bo; - int width, height; int fd; - int id; + uint32_t width, height; + uint32_t id; }; struct gbm @@ -59,6 +62,7 @@ struct egl }; struct priv { + GL gl; struct kms *kms; drmEventContext ev; @@ -66,43 +70,46 @@ struct priv { struct egl egl; struct gbm gbm; - struct framebuffer fb; + struct framebuffer *fb; + + uint32_t primary_plane_format; bool active; bool waiting_for_flip; bool vt_switcher_active; struct vt_switcher vt_switcher; + + struct mpv_opengl_cb_drm_params drm_params; }; -static bool init_egl(struct MPGLContext *ctx, int flags) +static bool init_egl(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - MP_VERBOSE(ctx->vo, "Initializing EGL\n"); + MP_VERBOSE(ctx, "Initializing EGL\n"); p->egl.display = eglGetDisplay(p->gbm.device); if (p->egl.display == EGL_NO_DISPLAY) { - MP_ERR(ctx->vo, "Failed to get EGL display.\n"); + MP_ERR(ctx, "Failed to get EGL display.\n"); return false; } if (!eglInitialize(p->egl.display, NULL, NULL)) { - MP_ERR(ctx->vo, "Failed to initialize EGL.\n"); + MP_ERR(ctx, "Failed to initialize EGL.\n"); return false; } EGLConfig config; - if (!mpegl_create_context(p->egl.display, ctx->vo->log, flags, - &p->egl.context, &config)) - return -1; - MP_VERBOSE(ctx->vo, "Initializing EGL surface\n"); + if (!mpegl_create_context(ctx, p->egl.display, &p->egl.context, &config)) + return false; + 
MP_VERBOSE(ctx, "Initializing EGL surface\n"); p->egl.surface = eglCreateWindowSurface(p->egl.display, config, p->gbm.surface, NULL); if (p->egl.surface == EGL_NO_SURFACE) { - MP_ERR(ctx->vo, "Failed to create EGL surface.\n"); + MP_ERR(ctx, "Failed to create EGL surface.\n"); return false; } return true; } -static bool init_gbm(struct MPGLContext *ctx) +static bool init_gbm(struct ra_ctx *ctx) { struct priv *p = ctx->priv; MP_VERBOSE(ctx->vo, "Creating GBM device\n"); @@ -118,7 +125,7 @@ static bool init_gbm(struct MPGLContext *ctx) p->gbm.device, p->kms->mode.hdisplay, p->kms->mode.vdisplay, - GBM_BO_FORMAT_XRGB8888, + p->primary_plane_format, // drm_fourcc.h defs should be gbm-compatible GBM_BO_USE_SCANOUT | GBM_BO_USE_RENDERING); if (!p->gbm.surface) { MP_ERR(ctx->vo, "Failed to create GBM surface.\n"); @@ -135,46 +142,50 @@ static void framebuffer_destroy_callback(struct gbm_bo *bo, void *data) } } -static void update_framebuffer_from_bo( - const struct MPGLContext *ctx, struct gbm_bo *bo) +static void update_framebuffer_from_bo(struct ra_ctx *ctx, struct gbm_bo *bo) { struct priv *p = ctx->priv; - p->fb.bo = bo; - p->fb.fd = p->kms->fd; - p->fb.width = gbm_bo_get_width(bo); - p->fb.height = gbm_bo_get_height(bo); - int stride = gbm_bo_get_stride(bo); - int handle = gbm_bo_get_handle(bo).u32; - - int ret = drmModeAddFB(p->kms->fd, p->fb.width, p->fb.height, - 24, 32, stride, handle, &p->fb.id); + struct framebuffer *fb = gbm_bo_get_user_data(bo); + if (fb) { + p->fb = fb; + return; + } + + fb = talloc_zero(ctx, struct framebuffer); + fb->fd = p->kms->fd; + fb->width = gbm_bo_get_width(bo); + fb->height = gbm_bo_get_height(bo); + uint32_t stride = gbm_bo_get_stride(bo); + uint32_t handle = gbm_bo_get_handle(bo).u32; + + int ret = drmModeAddFB2(fb->fd, fb->width, fb->height, + p->primary_plane_format, + (uint32_t[4]){handle, 0, 0, 0}, + (uint32_t[4]){stride, 0, 0, 0}, + (uint32_t[4]){0, 0, 0, 0}, + &fb->id, 0); + if (ret) { MP_ERR(ctx->vo, "Failed to create 
framebuffer: %s\n", mp_strerror(errno)); } - gbm_bo_set_user_data(bo, &p->fb, framebuffer_destroy_callback); -} - -static void page_flipped(int fd, unsigned int frame, unsigned int sec, - unsigned int usec, void *data) -{ - struct priv *p = data; - p->waiting_for_flip = false; + gbm_bo_set_user_data(bo, fb, framebuffer_destroy_callback); + p->fb = fb; } -static bool crtc_setup(struct MPGLContext *ctx) +static bool crtc_setup(struct ra_ctx *ctx) { struct priv *p = ctx->priv; if (p->active) return true; p->old_crtc = drmModeGetCrtc(p->kms->fd, p->kms->crtc_id); - int ret = drmModeSetCrtc(p->kms->fd, p->kms->crtc_id, p->fb.id, + int ret = drmModeSetCrtc(p->kms->fd, p->kms->crtc_id, p->fb->id, 0, 0, &p->kms->connector->connector_id, 1, &p->kms->mode); p->active = true; return ret == 0; } -static void crtc_release(struct MPGLContext *ctx) +static void crtc_release(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -204,7 +215,7 @@ static void crtc_release(struct MPGLContext *ctx) static void release_vt(void *data) { - struct MPGLContext *ctx = data; + struct ra_ctx *ctx = data; MP_VERBOSE(ctx->vo, "Releasing VT"); crtc_release(ctx); if (USE_MASTER) { @@ -221,7 +232,7 @@ static void release_vt(void *data) static void acquire_vt(void *data) { - struct MPGLContext *ctx = data; + struct ra_ctx *ctx = data; MP_VERBOSE(ctx->vo, "Acquiring VT"); if (USE_MASTER) { struct priv *p = ctx->priv; @@ -234,11 +245,78 @@ static void acquire_vt(void *data) crtc_setup(ctx); } -static void drm_egl_uninit(MPGLContext *ctx) +static bool drm_atomic_egl_start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo) +{ + struct priv *p = sw->ctx->priv; + if (p->kms->atomic_context) { + p->kms->atomic_context->request = drmModeAtomicAlloc(); + p->drm_params.atomic_request = p->kms->atomic_context->request; + return ra_gl_ctx_start_frame(sw, out_fbo); + } + return false; +} + +static const struct ra_swapchain_fns drm_atomic_swapchain = { + .start_frame = drm_atomic_egl_start_frame, +}; + +static 
void drm_egl_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - crtc_release(ctx); + struct drm_atomic_context *atomic_ctx = p->kms->atomic_context; + int ret; + + eglSwapBuffers(p->egl.display, p->egl.surface); + p->gbm.next_bo = gbm_surface_lock_front_buffer(p->gbm.surface); + p->waiting_for_flip = true; + update_framebuffer_from_bo(ctx, p->gbm.next_bo); + + if (atomic_ctx) { + drm_object_set_property(atomic_ctx->request, atomic_ctx->primary_plane, "FB_ID", p->fb->id); + drm_object_set_property(atomic_ctx->request, atomic_ctx->primary_plane, "CRTC_ID", atomic_ctx->crtc->id); + drm_object_set_property(atomic_ctx->request, atomic_ctx->primary_plane, "ZPOS", 1); + + ret = drmModeAtomicCommit(p->kms->fd, atomic_ctx->request, + DRM_MODE_ATOMIC_NONBLOCK | DRM_MODE_PAGE_FLIP_EVENT, NULL); + if (ret) + MP_WARN(ctx->vo, "Failed to commit atomic request (%d)\n", ret); + } else { + ret = drmModePageFlip(p->kms->fd, p->kms->crtc_id, p->fb->id, + DRM_MODE_PAGE_FLIP_EVENT, p); + if (ret) { + MP_WARN(ctx->vo, "Failed to queue page flip: %s\n", mp_strerror(errno)); + } + } + + // poll page flip finish event + const int timeout_ms = 3000; + struct pollfd fds[1] = { { .events = POLLIN, .fd = p->kms->fd } }; + poll(fds, 1, timeout_ms); + if (fds[0].revents & POLLIN) { + ret = drmHandleEvent(p->kms->fd, &p->ev); + if (ret != 0) { + MP_ERR(ctx->vo, "drmHandleEvent failed: %i\n", ret); + p->waiting_for_flip = false; + return; + } + } + p->waiting_for_flip = false; + + if (atomic_ctx) { + drmModeAtomicFree(atomic_ctx->request); + p->drm_params.atomic_request = atomic_ctx->request = NULL; + } + + gbm_surface_release_buffer(p->gbm.surface, p->gbm.bo); + p->gbm.bo = p->gbm.next_bo; +} + +static void drm_egl_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + crtc_release(ctx); if (p->vt_switcher_active) vt_switcher_destroy(&p->vt_switcher); @@ -258,100 +336,146 @@ static void drm_egl_uninit(MPGLContext *ctx) } } -static int 
drm_egl_init(struct MPGLContext *ctx, int flags) +// If primary plane supports ARGB8888 we want to use that, but if it doesn't we +// fall back on XRGB8888. If the driver does not support atomic there is no +// particular reason to be using ARGB8888, so we fall back to XRGB8888 (another +// reason is that we do not have the convenient atomic_ctx and its convenient +// primary_plane field). +static bool probe_primary_plane_format(struct ra_ctx *ctx) { - if (ctx->vo->probing) { - MP_VERBOSE(ctx->vo, "DRM EGL backend can be activated only manually.\n"); - return -1; - } struct priv *p = ctx->priv; - p->kms = NULL; - p->old_crtc = NULL; - p->gbm.surface = NULL; - p->gbm.device = NULL; - p->active = false; - p->waiting_for_flip = false; + if (!p->kms->atomic_context) { + p->primary_plane_format = DRM_FORMAT_XRGB8888; + MP_VERBOSE(ctx->vo, "Not using DRM Atomic: Use DRM_FORMAT_XRGB8888 for primary plane.\n"); + return true; + } + + drmModePlane *drmplane = + drmModeGetPlane(p->kms->fd, p->kms->atomic_context->primary_plane->id); + bool have_argb8888 = false; + bool have_xrgb8888 = false; + bool result = false; + for (unsigned int i = 0; i < drmplane->count_formats; ++i) { + if (drmplane->formats[i] == DRM_FORMAT_ARGB8888) { + have_argb8888 = true; + } else if (drmplane->formats[i] == DRM_FORMAT_XRGB8888) { + have_xrgb8888 = true; + } + } + + if (have_argb8888) { + p->primary_plane_format = DRM_FORMAT_ARGB8888; + MP_VERBOSE(ctx->vo, "DRM_FORMAT_ARGB8888 supported by primary plane.\n"); + result = true; + } else if (have_xrgb8888) { + p->primary_plane_format = DRM_FORMAT_XRGB8888; + MP_VERBOSE(ctx->vo, + "DRM_FORMAT_ARGB8888 not supported by primary plane: " + "Falling back to DRM_FORMAT_XRGB8888.\n"); + result = true; + } + + drmModeFreePlane(drmplane); + return result; +} + +static bool drm_egl_init(struct ra_ctx *ctx) +{ + if (ctx->opts.probing) { + MP_VERBOSE(ctx, "DRM EGL backend can be activated only manually.\n"); + return false; + } + + struct priv *p = ctx->priv = 
talloc_zero(ctx, struct priv); p->ev.version = DRM_EVENT_CONTEXT_VERSION; - p->ev.page_flip_handler = page_flipped; p->vt_switcher_active = vt_switcher_init(&p->vt_switcher, ctx->vo->log); if (p->vt_switcher_active) { vt_switcher_acquire(&p->vt_switcher, acquire_vt, ctx); vt_switcher_release(&p->vt_switcher, release_vt, ctx); } else { - MP_WARN(ctx->vo, "Failed to set up VT switcher. Terminal switching will be unavailable.\n"); + MP_WARN(ctx, "Failed to set up VT switcher. Terminal switching will be unavailable.\n"); } - MP_VERBOSE(ctx->vo, "Initializing KMS\n"); - p->kms = kms_create(ctx->vo->log, ctx->vo->opts->drm_connector_spec, - ctx->vo->opts->drm_mode_id); + MP_VERBOSE(ctx, "Initializing KMS\n"); + p->kms = kms_create(ctx->log, ctx->vo->opts->drm_opts->drm_connector_spec, + ctx->vo->opts->drm_opts->drm_mode_id, + ctx->vo->opts->drm_opts->drm_overlay_id); if (!p->kms) { - MP_ERR(ctx->vo, "Failed to create KMS.\n"); - return -1; + MP_ERR(ctx, "Failed to create KMS.\n"); + return false; + } + + if (!probe_primary_plane_format(ctx)) { + MP_ERR(ctx->vo, "No suitable format found on DRM primary plane.\n"); + return false; } if (!init_gbm(ctx)) { MP_ERR(ctx->vo, "Failed to setup GBM.\n"); - return -1; + return false; } - if (!init_egl(ctx, flags)) { + if (!init_egl(ctx)) { MP_ERR(ctx->vo, "Failed to setup EGL.\n"); - return -1; + return false; } if (!eglMakeCurrent(p->egl.display, p->egl.surface, p->egl.surface, p->egl.context)) { MP_ERR(ctx->vo, "Failed to make context current.\n"); - return -1; + return false; } - mpegl_load_functions(ctx->gl, ctx->vo->log); - - ctx->native_display_type = "drm"; - ctx->native_display = (void *)(intptr_t)p->kms->fd; - + mpegl_load_functions(&p->gl, ctx->vo->log); // required by gbm_surface_lock_front_buffer eglSwapBuffers(p->egl.display, p->egl.surface); - MP_VERBOSE(ctx->vo, "Preparing framebuffer\n"); + MP_VERBOSE(ctx, "Preparing framebuffer\n"); p->gbm.bo = gbm_surface_lock_front_buffer(p->gbm.surface); if (!p->gbm.bo) { - 
MP_ERR(ctx->vo, "Failed to lock GBM surface.\n"); - return -1; + MP_ERR(ctx, "Failed to lock GBM surface.\n"); + return false; } update_framebuffer_from_bo(ctx, p->gbm.bo); - if (!p->fb.id) { - MP_ERR(ctx->vo, "Failed to create framebuffer.\n"); - return -1; + if (!p->fb || !p->fb->id) { + MP_ERR(ctx, "Failed to create framebuffer.\n"); + return false; } if (!crtc_setup(ctx)) { - MP_ERR(ctx->vo, "Failed to set CRTC for connector %u: %s\n", + MP_ERR(ctx, "Failed to set CRTC for connector %u: %s\n", p->kms->connector->connector_id, mp_strerror(errno)); - return -1; + return false; } - return 0; -} + p->drm_params.fd = p->kms->fd; + p->drm_params.crtc_id = p->kms->crtc_id; + if (p->kms->atomic_context) + p->drm_params.atomic_request = p->kms->atomic_context->request; + struct ra_gl_ctx_params params = { + .swap_buffers = drm_egl_swap_buffers, + .native_display_type = "opengl-cb-drm-params", + .native_display = &p->drm_params, + .external_swapchain = p->kms->atomic_context ? &drm_atomic_swapchain : + NULL, + }; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + return false; -static int drm_egl_init_deprecated(struct MPGLContext *ctx, int flags) -{ - if (ctx->vo->probing) - return -1; - MP_WARN(ctx->vo, "'drm-egl' is deprecated, use 'drm' instead.\n"); - return drm_egl_init(ctx, flags); + return true; } -static int drm_egl_reconfig(struct MPGLContext *ctx) +static bool drm_egl_reconfig(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - ctx->vo->dwidth = p->fb.width; - ctx->vo->dheight = p->fb.height; - return 0; + ctx->vo->dwidth = p->fb->width; + ctx->vo->dheight = p->fb->height; + ra_gl_ctx_resize(ctx->swapchain, p->fb->width, p->fb->height, 0); + return true; } -static int drm_egl_control(struct MPGLContext *ctx, int *events, int request, +static int drm_egl_control(struct ra_ctx *ctx, int *events, int request, void *arg) { struct priv *p = ctx->priv; @@ -367,51 +491,11 @@ static int drm_egl_control(struct MPGLContext *ctx, int *events, int request, return VO_NOTIMPL; } 
-static void drm_egl_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl.display, p->egl.surface); - p->gbm.next_bo = gbm_surface_lock_front_buffer(p->gbm.surface); - p->waiting_for_flip = true; - update_framebuffer_from_bo(ctx, p->gbm.next_bo); - int ret = drmModePageFlip(p->kms->fd, p->kms->crtc_id, p->fb.id, - DRM_MODE_PAGE_FLIP_EVENT, p); - if (ret) { - MP_WARN(ctx->vo, "Failed to queue page flip: %s\n", mp_strerror(errno)); - } - - // poll page flip finish event - const int timeout_ms = 3000; - struct pollfd fds[1] = { { .events = POLLIN, .fd = p->kms->fd } }; - poll(fds, 1, timeout_ms); - if (fds[0].revents & POLLIN) { - ret = drmHandleEvent(p->kms->fd, &p->ev); - if (ret != 0) { - MP_ERR(ctx->vo, "drmHandleEvent failed: %i\n", ret); - return; - } - } - - gbm_surface_release_buffer(p->gbm.surface, p->gbm.bo); - p->gbm.bo = p->gbm.next_bo; -} - -const struct mpgl_driver mpgl_driver_drm = { +const struct ra_ctx_fns ra_ctx_drm_egl = { + .type = "opengl", .name = "drm", - .priv_size = sizeof(struct priv), - .init = drm_egl_init, - .reconfig = drm_egl_reconfig, - .swap_buffers = drm_egl_swap_buffers, - .control = drm_egl_control, - .uninit = drm_egl_uninit, -}; - -const struct mpgl_driver mpgl_driver_drm_egl = { - .name = "drm-egl", - .priv_size = sizeof(struct priv), - .init = drm_egl_init_deprecated, .reconfig = drm_egl_reconfig, - .swap_buffers = drm_egl_swap_buffers, .control = drm_egl_control, + .init = drm_egl_init, .uninit = drm_egl_uninit, }; diff --git a/video/out/opengl/context_dxinterop.c b/video/out/opengl/context_dxinterop.c index 507c150..85d84bf 100644 --- a/video/out/opengl/context_dxinterop.c +++ b/video/out/opengl/context_dxinterop.c @@ -22,6 +22,7 @@ #include "osdep/windows_utils.h" #include "video/out/w32_common.h" #include "context.h" +#include "utils.h" // For WGL_ACCESS_WRITE_DISCARD_NV, etc. 
#include <GL/wglext.h> @@ -35,6 +36,8 @@ EXTERN_C IMAGE_DOS_HEADER __ImageBase; #endif struct priv { + GL gl; + HMODULE d3d9_dll; HRESULT (WINAPI *Direct3DCreate9Ex)(UINT SDKVersion, IDirect3D9Ex **ppD3D); @@ -54,6 +57,7 @@ struct priv { // OpenGL resources GLuint texture; + GLuint main_fb; // Did we lose the device? bool lost_device; @@ -63,7 +67,7 @@ struct priv { int width, height, swapinterval; }; -static __thread struct MPGLContext *current_ctx; +static __thread struct ra_ctx *current_ctx; static void pump_message_loop(void) { @@ -84,10 +88,11 @@ static void *w32gpa(const GLubyte *procName) return GetProcAddress(oglmod, procName); } -static int os_ctx_create(struct MPGLContext *ctx) +static int os_ctx_create(struct ra_ctx *ctx) { static const wchar_t os_wnd_class[] = L"mpv offscreen gl"; struct priv *p = ctx->priv; + GL *gl = &p->gl; HGLRC legacy_context = NULL; RegisterClassExW(&(WNDCLASSEXW) { @@ -190,8 +195,8 @@ static int os_ctx_create(struct MPGLContext *ctx) goto fail; } - mpgl_load_functions(ctx->gl, w32gpa, wgl_exts, ctx->vo->log); - if (!(ctx->gl->mpgl_caps & MPGL_CAP_DXINTEROP)) { + mpgl_load_functions(gl, w32gpa, wgl_exts, ctx->vo->log); + if (!(gl->mpgl_caps & MPGL_CAP_DXINTEROP)) { MP_FATAL(ctx->vo, "WGL_NV_DX_interop is not supported\n"); goto fail; } @@ -205,7 +210,7 @@ fail: return -1; } -static void os_ctx_destroy(MPGLContext *ctx) +static void os_ctx_destroy(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -219,10 +224,10 @@ static void os_ctx_destroy(MPGLContext *ctx) DestroyWindow(p->os_wnd); } -static int d3d_size_dependent_create(MPGLContext *ctx) +static int d3d_size_dependent_create(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - struct GL *gl = ctx->gl; + GL *gl = &p->gl; HRESULT hr; IDirect3DSwapChain9 *sw9; @@ -294,7 +299,7 @@ static int d3d_size_dependent_create(MPGLContext *ctx) return -1; } - gl->BindFramebuffer(GL_FRAMEBUFFER, ctx->main_fb); + gl->BindFramebuffer(GL_FRAMEBUFFER, p->main_fb); 
gl->FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, p->texture, 0); gl->BindFramebuffer(GL_FRAMEBUFFER, 0); @@ -302,10 +307,10 @@ static int d3d_size_dependent_create(MPGLContext *ctx) return 0; } -static void d3d_size_dependent_destroy(MPGLContext *ctx) +static void d3d_size_dependent_destroy(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - struct GL *gl = ctx->gl; + GL *gl = &p->gl; if (p->rtarget_h) { gl->DXUnlockObjectsNV(p->device_h, 1, &p->rtarget_h); @@ -321,7 +326,8 @@ static void d3d_size_dependent_destroy(MPGLContext *ctx) SAFE_RELEASE(p->swapchain); } -static void fill_presentparams(MPGLContext *ctx, D3DPRESENT_PARAMETERS *pparams) +static void fill_presentparams(struct ra_ctx *ctx, + D3DPRESENT_PARAMETERS *pparams) { struct priv *p = ctx->priv; @@ -338,13 +344,9 @@ static void fill_presentparams(MPGLContext *ctx, D3DPRESENT_PARAMETERS *pparams) .Windowed = TRUE, .BackBufferWidth = ctx->vo->dwidth ? ctx->vo->dwidth : 1, .BackBufferHeight = ctx->vo->dheight ? ctx->vo->dheight : 1, - // The length of the backbuffer queue shouldn't affect latency because - // swap_buffers() always uses the backbuffer at the head of the queue - // and presents it immediately. MSDN says there is a performance - // penalty for having a short backbuffer queue and this seems to be - // true, at least on Nvidia, where less than four backbuffers causes - // very high CPU usage. Use six to be safe. - .BackBufferCount = 6, + // Add one frame for the backbuffer and one frame of "slack" to reduce + // contention with the window manager when acquiring the backbuffer + .BackBufferCount = ctx->opts.swapchain_depth + 2, .SwapEffect = IsWindows7OrGreater() ? 
D3DSWAPEFFECT_FLIPEX : D3DSWAPEFFECT_FLIP, // Automatically get the backbuffer format from the display format .BackBufferFormat = D3DFMT_UNKNOWN, @@ -353,10 +355,10 @@ static void fill_presentparams(MPGLContext *ctx, D3DPRESENT_PARAMETERS *pparams) }; } -static int d3d_create(MPGLContext *ctx) +static int d3d_create(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - struct GL *gl = ctx->gl; + GL *gl = &p->gl; HRESULT hr; p->d3d9_dll = LoadLibraryW(L"d3d9.dll"); @@ -396,8 +398,7 @@ static int d3d_create(MPGLContext *ctx) return -1; } - // mpv expects frames to be presented right after swap_buffers() returns - IDirect3DDevice9Ex_SetMaximumFrameLatency(p->device, 1); + IDirect3DDevice9Ex_SetMaximumFrameLatency(p->device, ctx->opts.swapchain_depth); // Register the Direct3D device with WGL_NV_dx_interop p->device_h = gl->DXOpenDeviceNV(p->device); @@ -410,10 +411,10 @@ static int d3d_create(MPGLContext *ctx) return 0; } -static void d3d_destroy(MPGLContext *ctx) +static void d3d_destroy(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - struct GL *gl = ctx->gl; + GL *gl = &p->gl; if (p->device_h) gl->DXCloseDeviceNV(p->device_h); @@ -423,8 +424,9 @@ static void d3d_destroy(MPGLContext *ctx) FreeLibrary(p->d3d9_dll); } -static void dxinterop_uninit(MPGLContext *ctx) +static void dxgl_uninit(struct ra_ctx *ctx) { + ra_gl_ctx_uninit(ctx); d3d_size_dependent_destroy(ctx); d3d_destroy(ctx); os_ctx_destroy(ctx); @@ -433,7 +435,7 @@ static void dxinterop_uninit(MPGLContext *ctx) pump_message_loop(); } -static void dxinterop_reset(struct MPGLContext *ctx) +static void dxgl_reset(struct ra_ctx *ctx) { struct priv *p = ctx->priv; HRESULT hr; @@ -468,18 +470,18 @@ static void dxinterop_reset(struct MPGLContext *ctx) p->lost_device = false; } -static int GLAPIENTRY dxinterop_swap_interval(int interval) +static int GLAPIENTRY dxgl_swap_interval(int interval) { if (!current_ctx) return 0; struct priv *p = current_ctx->priv; p->requested_swapinterval = interval; - 
dxinterop_reset(current_ctx); + dxgl_reset(current_ctx); return 1; } -static void * GLAPIENTRY dxinterop_get_native_display(const char *name) +static void * GLAPIENTRY dxgl_get_native_display(const char *name) { if (!current_ctx || !name) return NULL; @@ -493,60 +495,17 @@ static void * GLAPIENTRY dxinterop_get_native_display(const char *name) return NULL; } -static int dxinterop_init(struct MPGLContext *ctx, int flags) -{ - struct priv *p = ctx->priv; - struct GL *gl = ctx->gl; - - p->requested_swapinterval = 1; - - if (!vo_w32_init(ctx->vo)) - goto fail; - if (os_ctx_create(ctx) < 0) - goto fail; - - // Create the shared framebuffer - gl->GenFramebuffers(1, &ctx->main_fb); - - current_ctx = ctx; - gl->SwapInterval = dxinterop_swap_interval; - gl->MPGetNativeDisplay = dxinterop_get_native_display; - - if (d3d_create(ctx) < 0) - goto fail; - if (d3d_size_dependent_create(ctx) < 0) - goto fail; - - // The OpenGL and Direct3D coordinate systems are flipped vertically - // relative to each other. Flip the video during rendering so it can be - // copied to the Direct3D backbuffer with a simple (and fast) StretchRect. - ctx->flip_v = true; - - DwmEnableMMCSS(TRUE); - - return 0; -fail: - dxinterop_uninit(ctx); - return -1; -} - -static int dxinterop_reconfig(struct MPGLContext *ctx) -{ - vo_w32_config(ctx->vo); - return 0; -} - -static void dxinterop_swap_buffers(MPGLContext *ctx) +static void dxgl_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - struct GL *gl = ctx->gl; + GL *gl = &p->gl; HRESULT hr; pump_message_loop(); // If the device is still lost, try to reset it again if (p->lost_device) - dxinterop_reset(ctx); + dxgl_reset(ctx); if (p->lost_device) return; @@ -571,7 +530,7 @@ static void dxinterop_swap_buffers(MPGLContext *ctx) case D3DERR_DEVICEHUNG: MP_VERBOSE(ctx->vo, "Direct3D device lost! 
Resetting.\n"); p->lost_device = true; - dxinterop_reset(ctx); + dxgl_reset(ctx); return; default: if (FAILED(hr)) @@ -584,21 +543,75 @@ static void dxinterop_swap_buffers(MPGLContext *ctx) } } -static int dxinterop_control(MPGLContext *ctx, int *events, int request, +static bool dxgl_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + GL *gl = &p->gl; + + p->requested_swapinterval = 1; + + if (!vo_w32_init(ctx->vo)) + goto fail; + if (os_ctx_create(ctx) < 0) + goto fail; + + // Create the shared framebuffer + gl->GenFramebuffers(1, &p->main_fb); + + current_ctx = ctx; + gl->SwapInterval = dxgl_swap_interval; + gl->MPGetNativeDisplay = dxgl_get_native_display; + + if (d3d_create(ctx) < 0) + goto fail; + if (d3d_size_dependent_create(ctx) < 0) + goto fail; + + static const struct ra_swapchain_fns empty_swapchain_fns = {0}; + struct ra_gl_ctx_params params = { + .swap_buffers = dxgl_swap_buffers, + .flipped = true, + .external_swapchain = &empty_swapchain_fns, + }; + + if (!ra_gl_ctx_init(ctx, gl, params)) + goto fail; + + DwmEnableMMCSS(TRUE); + return true; +fail: + dxgl_uninit(ctx); + return false; +} + +static void resize(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + dxgl_reset(ctx); + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, p->main_fb); +} + +static bool dxgl_reconfig(struct ra_ctx *ctx) +{ + vo_w32_config(ctx->vo); + resize(ctx); + return true; +} + +static int dxgl_control(struct ra_ctx *ctx, int *events, int request, void *arg) { - int r = vo_w32_control(ctx->vo, events, request, arg); + int ret = vo_w32_control(ctx->vo, events, request, arg); if (*events & VO_EVENT_RESIZE) - dxinterop_reset(ctx); - return r; + resize(ctx); + return ret; } -const struct mpgl_driver mpgl_driver_dxinterop = { +const struct ra_ctx_fns ra_ctx_dxgl = { + .type = "opengl", .name = "dxinterop", - .priv_size = sizeof(struct priv), - .init = dxinterop_init, - .reconfig = dxinterop_reconfig, - .swap_buffers = 
dxinterop_swap_buffers, - .control = dxinterop_control, - .uninit = dxinterop_uninit, + .init = dxgl_init, + .reconfig = dxgl_reconfig, + .control = dxgl_control, + .uninit = dxgl_uninit, }; diff --git a/video/out/opengl/context_x11.c b/video/out/opengl/context_glx.c index 4d8dac1..462f2cf 100644 --- a/video/out/opengl/context_x11.c +++ b/video/out/opengl/context_glx.c @@ -39,43 +39,46 @@ #include "video/out/x11_common.h" #include "context.h" +#include "utils.h" -struct glx_context { +struct priv { + GL gl; XVisualInfo *vinfo; GLXContext context; GLXFBConfig fbc; }; -static void glx_uninit(MPGLContext *ctx) +static void glx_uninit(struct ra_ctx *ctx) { - struct glx_context *glx_ctx = ctx->priv; - if (glx_ctx->vinfo) - XFree(glx_ctx->vinfo); - if (glx_ctx->context) { + struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + + if (p->vinfo) + XFree(p->vinfo); + if (p->context) { Display *display = ctx->vo->x11->display; glXMakeCurrent(display, None, NULL); - glXDestroyContext(display, glx_ctx->context); + glXDestroyContext(display, p->context); } + vo_x11_uninit(ctx->vo); } -static bool create_context_x11_old(struct MPGLContext *ctx) +static bool create_context_x11_old(struct ra_ctx *ctx, GL *gl) { - struct glx_context *glx_ctx = ctx->priv; + struct priv *p = ctx->priv; Display *display = ctx->vo->x11->display; struct vo *vo = ctx->vo; - GL *gl = ctx->gl; - if (glx_ctx->context) + if (p->context) return true; - if (!glx_ctx->vinfo) { + if (!p->vinfo) { MP_FATAL(vo, "Can't create a legacy GLX context without X visual\n"); return false; } - GLXContext new_context = glXCreateContext(display, glx_ctx->vinfo, NULL, - True); + GLXContext new_context = glXCreateContext(display, p->vinfo, NULL, True); if (!new_context) { MP_FATAL(vo, "Could not create GLX context!\n"); return false; @@ -91,7 +94,7 @@ static bool create_context_x11_old(struct MPGLContext *ctx) mpgl_load_functions(gl, (void *)glXGetProcAddressARB, glxstr, vo->log); - glx_ctx->context = new_context; + p->context 
= new_context; return true; } @@ -99,15 +102,18 @@ static bool create_context_x11_old(struct MPGLContext *ctx) typedef GLXContext (*glXCreateContextAttribsARBProc) (Display*, GLXFBConfig, GLXContext, Bool, const int*); -static bool create_context_x11_gl3(struct MPGLContext *ctx, int vo_flags, - int gl_version, bool es) +static bool create_context_x11_gl3(struct ra_ctx *ctx, GL *gl, int gl_version, + bool es) { - struct glx_context *glx_ctx = ctx->priv; + struct priv *p = ctx->priv; struct vo *vo = ctx->vo; - if (glx_ctx->context) + if (p->context) return true; + if (!ra_gl_ctx_test_version(ctx, gl_version, es)) + return false; + glXCreateContextAttribsARBProc glXCreateContextAttribsARB = (glXCreateContextAttribsARBProc) glXGetProcAddressARB((const GLubyte *)"glXCreateContextAttribsARB"); @@ -120,7 +126,7 @@ static bool create_context_x11_gl3(struct MPGLContext *ctx, int vo_flags, return false; } - int ctx_flags = vo_flags & VOFLAG_GL_DEBUG ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; + int ctx_flags = ctx->opts.debug ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; int profile_mask = GLX_CONTEXT_CORE_PROFILE_BIT_ARB; if (es) { @@ -138,7 +144,7 @@ static bool create_context_x11_gl3(struct MPGLContext *ctx, int vo_flags, }; vo_x11_silence_xlib(1); GLXContext context = glXCreateContextAttribsARB(vo->x11->display, - glx_ctx->fbc, 0, True, + p->fbc, 0, True, context_attribs); vo_x11_silence_xlib(-1); if (!context) @@ -151,9 +157,9 @@ static bool create_context_x11_gl3(struct MPGLContext *ctx, int vo_flags, return false; } - glx_ctx->context = context; + p->context = context; - mpgl_load_functions(ctx->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); + mpgl_load_functions(gl, (void *)glXGetProcAddressARB, glxstr, vo->log); return true; } @@ -162,7 +168,7 @@ static bool create_context_x11_gl3(struct MPGLContext *ctx, int vo_flags, // http://www.opengl.org/wiki/Tutorial:_OpenGL_3.0_Context_Creation_(GLX) // but also uses some of the old code. 
-static GLXFBConfig select_fb_config(struct vo *vo, const int *attribs, int flags) +static GLXFBConfig select_fb_config(struct vo *vo, const int *attribs, bool alpha) { int fbcount; GLXFBConfig *fbc = glXChooseFBConfig(vo->x11->display, vo->x11->screen, @@ -173,7 +179,7 @@ static GLXFBConfig select_fb_config(struct vo *vo, const int *attribs, int flags // The list in fbc is sorted (so that the first element is the best). GLXFBConfig fbconfig = fbcount > 0 ? fbc[0] : NULL; - if (flags & VOFLAG_ALPHA) { + if (alpha) { for (int n = 0; n < fbcount; n++) { XVisualInfo *v = glXGetVisualFromFBConfig(vo->x11->display, fbc[n]); if (v) { @@ -202,10 +208,16 @@ static void set_glx_attrib(int *attribs, int name, int value) } } -static int glx_init(struct MPGLContext *ctx, int flags) +static void glx_swap_buffers(struct ra_ctx *ctx) +{ + glXSwapBuffers(ctx->vo->x11->display, ctx->vo->x11->window); +} + +static bool glx_init(struct ra_ctx *ctx) { + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); struct vo *vo = ctx->vo; - struct glx_context *glx_ctx = ctx->priv; + GL *gl = &p->gl; if (!vo_x11_init(ctx->vo)) goto uninit; @@ -213,12 +225,12 @@ static int glx_init(struct MPGLContext *ctx, int flags) int glx_major, glx_minor; if (!glXQueryVersion(vo->x11->display, &glx_major, &glx_minor)) { - MP_ERR(vo, "GLX not found.\n"); + MP_ERR(ctx, "GLX not found.\n"); goto uninit; } // FBConfigs were added in GLX version 1.3. 
if (MPGL_VER(glx_major, glx_minor) < MPGL_VER(1, 3)) { - MP_ERR(vo, "GLX version older than 1.3.\n"); + MP_ERR(ctx, "GLX version older than 1.3.\n"); goto uninit; } @@ -233,126 +245,132 @@ static int glx_init(struct MPGLContext *ctx, int flags) None }; GLXFBConfig fbc = NULL; - if (flags & VOFLAG_ALPHA) { + if (ctx->opts.want_alpha) { set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 1); - fbc = select_fb_config(vo, glx_attribs, flags); - if (!fbc) { + fbc = select_fb_config(vo, glx_attribs, true); + if (!fbc) set_glx_attrib(glx_attribs, GLX_ALPHA_SIZE, 0); - flags &= ~VOFLAG_ALPHA; - } } if (!fbc) - fbc = select_fb_config(vo, glx_attribs, flags); + fbc = select_fb_config(vo, glx_attribs, false); if (!fbc) { - MP_ERR(vo, "no GLX support present\n"); + MP_ERR(ctx, "no GLX support present\n"); goto uninit; } int fbid = -1; if (!glXGetFBConfigAttrib(vo->x11->display, fbc, GLX_FBCONFIG_ID, &fbid)) - MP_VERBOSE(vo, "GLX chose FB config with ID 0x%x\n", fbid); + MP_VERBOSE(ctx, "GLX chose FB config with ID 0x%x\n", fbid); - glx_ctx->fbc = fbc; - glx_ctx->vinfo = glXGetVisualFromFBConfig(vo->x11->display, fbc); - if (glx_ctx->vinfo) { - MP_VERBOSE(vo, "GLX chose visual with ID 0x%x\n", - (int)glx_ctx->vinfo->visualid); + p->fbc = fbc; + p->vinfo = glXGetVisualFromFBConfig(vo->x11->display, fbc); + if (p->vinfo) { + MP_VERBOSE(ctx, "GLX chose visual with ID 0x%x\n", + (int)p->vinfo->visualid); } else { - MP_WARN(vo, "Selected GLX FB config has no associated X visual\n"); + MP_WARN(ctx, "Selected GLX FB config has no associated X visual\n"); } - if (!vo_x11_create_vo_window(vo, glx_ctx->vinfo, "gl")) + if (!vo_x11_create_vo_window(vo, p->vinfo, "gl")) goto uninit; bool success = false; - if (!(flags & VOFLAG_GLES)) { - for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { - int version = mpgl_preferred_gl_versions[n]; - MP_VERBOSE(vo, "Creating OpenGL %d.%d context...\n", - MPGL_VER_P(version)); - if (version >= 300) { - success = create_context_x11_gl3(ctx, flags, version, 
false); - } else { - success = create_context_x11_old(ctx); - } - if (success) - break; + for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { + int version = mpgl_preferred_gl_versions[n]; + MP_VERBOSE(ctx, "Creating OpenGL %d.%d context...\n", + MPGL_VER_P(version)); + if (version >= 300) { + success = create_context_x11_gl3(ctx, gl, version, false); + } else { + success = create_context_x11_old(ctx, gl); } + if (success) + break; } - if (!success) // try ES - success = create_context_x11_gl3(ctx, flags, 200, true); - if (success && !glXIsDirect(vo->x11->display, glx_ctx->context)) - ctx->gl->mpgl_caps |= MPGL_CAP_SW; + if (!success) // try again for GLES + success = create_context_x11_gl3(ctx, gl, 200, true); + if (success && !glXIsDirect(vo->x11->display, p->context)) + gl->mpgl_caps |= MPGL_CAP_SW; if (!success) goto uninit; - return 0; + struct ra_gl_ctx_params params = { + .swap_buffers = glx_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, gl, params)) + goto uninit; + + return true; uninit: glx_uninit(ctx); - return -1; + return false; } -static int glx_init_probe(struct MPGLContext *ctx, int flags) +static bool glx_init_probe(struct ra_ctx *ctx) { - int r = glx_init(ctx, flags); - if (r >= 0) { - if (!(ctx->gl->mpgl_caps & MPGL_CAP_VDPAU)) { - MP_VERBOSE(ctx->vo, "No vdpau support found - probing more things.\n"); - glx_uninit(ctx); - r = -1; - } + if (!glx_init(ctx)) + return false; + + struct priv *p = ctx->priv; + if (!(p->gl.mpgl_caps & MPGL_CAP_VDPAU)) { + MP_VERBOSE(ctx, "No vdpau support found - probing more things.\n"); + glx_uninit(ctx); + return false; } - return r; + + return true; } -static int glx_reconfig(struct MPGLContext *ctx) +static void resize(struct ra_ctx *ctx) { - vo_x11_config_vo_window(ctx->vo); - return 0; + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); } -static int glx_control(struct MPGLContext *ctx, int *events, int request, - void *arg) +static bool glx_reconfig(struct ra_ctx *ctx) { - return 
vo_x11_control(ctx->vo, events, request, arg); + vo_x11_config_vo_window(ctx->vo); + resize(ctx); + return true; } -static void glx_swap_buffers(struct MPGLContext *ctx) +static int glx_control(struct ra_ctx *ctx, int *events, int request, void *arg) { - glXSwapBuffers(ctx->vo->x11->display, ctx->vo->x11->window); + int ret = vo_x11_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; } -static void glx_wakeup(struct MPGLContext *ctx) +static void glx_wakeup(struct ra_ctx *ctx) { vo_x11_wakeup(ctx->vo); } -static void glx_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void glx_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_x11_wait_events(ctx->vo, until_time_us); } -const struct mpgl_driver mpgl_driver_x11 = { +const struct ra_ctx_fns ra_ctx_glx = { + .type = "opengl", .name = "x11", - .priv_size = sizeof(struct glx_context), - .init = glx_init, .reconfig = glx_reconfig, - .swap_buffers = glx_swap_buffers, .control = glx_control, .wakeup = glx_wakeup, .wait_events = glx_wait_events, + .init = glx_init, .uninit = glx_uninit, }; -const struct mpgl_driver mpgl_driver_x11_probe = { +const struct ra_ctx_fns ra_ctx_glx_probe = { + .type = "opengl", .name = "x11probe", - .priv_size = sizeof(struct glx_context), - .init = glx_init_probe, .reconfig = glx_reconfig, - .swap_buffers = glx_swap_buffers, .control = glx_control, .wakeup = glx_wakeup, .wait_events = glx_wait_events, + .init = glx_init_probe, .uninit = glx_uninit, }; diff --git a/video/out/opengl/context_mali_fbdev.c b/video/out/opengl/context_mali_fbdev.c index 66daa7f..8576e53 100644 --- a/video/out/opengl/context_mali_fbdev.c +++ b/video/out/opengl/context_mali_fbdev.c @@ -50,8 +50,7 @@ static bool get_fbdev_size(int *w, int *h) } struct priv { - struct mp_log *log; - struct GL *gl; + struct GL gl; EGLDisplay egl_display; EGLConfig egl_config; EGLContext egl_context; @@ -60,9 +59,10 @@ struct priv { int w, h; }; -static void 
mali_uninit(struct MPGLContext *ctx) +static void mali_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); if (p->egl_surface) { eglMakeCurrent(p->egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, @@ -74,25 +74,29 @@ static void mali_uninit(struct MPGLContext *ctx) eglReleaseThread(); } -static int mali_init(struct MPGLContext *ctx, int flags) +static void mali_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - p->log = ctx->vo->log; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool mali_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); if (!get_fbdev_size(&p->w, &p->h)) { - MP_FATAL(p, "Could not get fbdev size.\n"); + MP_FATAL(ctx, "Could not get fbdev size.\n"); goto fail; } p->egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY); if (!eglInitialize(p->egl_display, NULL, NULL)) { - MP_FATAL(p, "EGL failed to initialize.\n"); + MP_FATAL(ctx, "EGL failed to initialize.\n"); goto fail; } EGLConfig config; - if (!mpegl_create_context(p->egl_display, p->log, flags, &p->egl_context, - &config)) + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, &config)) goto fail; p->egl_window = (struct fbdev_window){ @@ -104,53 +108,51 @@ static int mali_init(struct MPGLContext *ctx, int flags) (EGLNativeWindowType)&p->egl_window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(p, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto fail; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(p, "Failed to set context!\n"); + MP_FATAL(ctx, "Failed to set context!\n"); goto fail; } - ctx->gl = talloc_zero(ctx, GL); + mpegl_load_functions(&p->gl, ctx->log); - mpegl_load_functions(ctx->gl, p->log); + struct ra_gl_ctx_params params = { + .swap_buffers = mali_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto fail; - return 0; + return true; fail: mali_uninit(ctx); - 
return -1; + return false; } -static int mali_reconfig(struct MPGLContext *ctx) +static bool mali_reconfig(struct ra_ctx *ctx) { struct priv *p = ctx->priv; ctx->vo->dwidth = p->w; ctx->vo->dheight = p->h; - return 0; + ra_gl_ctx_resize(ctx->swapchain, p->w, p->h, 0); } -static void mali_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); -} - -static int mali_control(MPGLContext *ctx, int *events, int request, void *arg) +static int mali_control(struct ra_ctx *ctx, int *events, int request, void *arg) { return VO_NOTIMPL; } -const struct mpgl_driver mpgl_driver_mali = { +const struct ra_ctx_fns ra_ctx_mali_fbdev = { + .type = "opengl", .name = "mali-fbdev", - .priv_size = sizeof(struct priv), - .init = mali_init, .reconfig = mali_reconfig, - .swap_buffers = mali_swap_buffers, .control = mali_control, + .init = mali_init, .uninit = mali_uninit, }; diff --git a/video/out/opengl/context_rpi.c b/video/out/opengl/context_rpi.c index e79622b..8b447d0 100644 --- a/video/out/opengl/context_rpi.c +++ b/video/out/opengl/context_rpi.c @@ -30,7 +30,7 @@ #include "egl_helpers.h" struct priv { - struct mp_log *log; + struct GL gl; DISPMANX_DISPLAY_HANDLE_T display; DISPMANX_ELEMENT_HANDLE_T window; DISPMANX_UPDATE_HANDLE_T update; @@ -49,13 +49,13 @@ struct priv { static void tv_callback(void *callback_data, uint32_t reason, uint32_t param1, uint32_t param2) { - struct MPGLContext *ctx = callback_data; + struct ra_ctx *ctx = callback_data; struct priv *p = ctx->priv; atomic_store(&p->reload_display, true); vo_wakeup(ctx->vo); } -static void destroy_dispmanx(struct MPGLContext *ctx) +static void destroy_dispmanx(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -77,9 +77,10 @@ static void destroy_dispmanx(struct MPGLContext *ctx) p->update = 0; } -static void rpi_uninit(MPGLContext *ctx) +static void rpi_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); 
vc_tv_unregister_callback_full(tv_callback, ctx); @@ -92,26 +93,26 @@ static void rpi_uninit(MPGLContext *ctx) p->egl_display = EGL_NO_DISPLAY; } -static int recreate_dispmanx(struct MPGLContext *ctx) +static bool recreate_dispmanx(struct ra_ctx *ctx) { struct priv *p = ctx->priv; int display_nr = 0; int layer = 0; - MP_VERBOSE(ctx->vo, "Recreating DISPMANX state...\n"); + MP_VERBOSE(ctx, "Recreating DISPMANX state...\n"); destroy_dispmanx(ctx); p->display = vc_dispmanx_display_open(display_nr); p->update = vc_dispmanx_update_start(0); if (!p->display || !p->update) { - MP_FATAL(ctx->vo, "Could not get DISPMANX objects.\n"); + MP_FATAL(ctx, "Could not get DISPMANX objects.\n"); goto fail; } uint32_t dispw, disph; if (graphics_get_display_size(0, &dispw, &disph) < 0) { - MP_FATAL(ctx->vo, "Could not get display size.\n"); + MP_FATAL(ctx, "Could not get display size.\n"); goto fail; } p->w = dispw; @@ -145,7 +146,7 @@ static int recreate_dispmanx(struct MPGLContext *ctx) &src, DISPMANX_PROTECTION_NONE, &alpha, 0, 0); if (!p->window) { - MP_FATAL(ctx->vo, "Could not add DISPMANX element.\n"); + MP_FATAL(ctx, "Could not add DISPMANX element.\n"); goto fail; } @@ -161,14 +162,14 @@ static int recreate_dispmanx(struct MPGLContext *ctx) &p->egl_window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(p, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto fail; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(p, "Failed to set context!\n"); + MP_FATAL(ctx, "Failed to set context!\n"); goto fail; } @@ -197,21 +198,27 @@ static int recreate_dispmanx(struct MPGLContext *ctx) ctx->vo->dwidth = p->w; ctx->vo->dheight = p->h; + ra_gl_ctx_resize(ctx->swapchain, p->w, p->h, 0); ctx->vo->want_redraw = true; vo_event(ctx->vo, VO_EVENT_WIN_STATE); - return 0; + return true; fail: destroy_dispmanx(ctx); - return -1; + return false; } -static int rpi_init(struct MPGLContext *ctx, int 
flags) +static void rpi_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; - p->log = ctx->vo->log; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool rpi_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); bcm_host_init(); @@ -219,43 +226,40 @@ static int rpi_init(struct MPGLContext *ctx, int flags) p->egl_display = eglGetDisplay(EGL_DEFAULT_DISPLAY); if (!eglInitialize(p->egl_display, NULL, NULL)) { - MP_FATAL(p, "EGL failed to initialize.\n"); + MP_FATAL(ctx, "EGL failed to initialize.\n"); goto fail; } - if (!mpegl_create_context(p->egl_display, p->log, 0, &p->egl_context, - &p->egl_config)) + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, &p->egl_config)) goto fail; if (recreate_dispmanx(ctx) < 0) goto fail; - ctx->gl = talloc_zero(ctx, GL); + mpegl_load_functions(&p->gl, ctx->log); - mpegl_load_functions(ctx->gl, p->log); + struct ra_gl_ctx_params params = { + .swap_buffers = rpi_swap_buffers, + .native_display_type = "MPV_RPI_WINDOW", + .native_display = p->win_params, + }; - ctx->native_display_type = "MPV_RPI_WINDOW"; - ctx->native_display = p->win_params; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto fail; - return 0; + return true; fail: rpi_uninit(ctx); - return -1; + return false; } -static int rpi_reconfig(struct MPGLContext *ctx) +static bool rpi_reconfig(struct ra_ctx *ctx) { return recreate_dispmanx(ctx); } -static void rpi_swap_buffers(MPGLContext *ctx) -{ - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); -} - -static struct mp_image *take_screenshot(struct MPGLContext *ctx) +static struct mp_image *take_screenshot(struct ra_ctx *ctx) { struct priv *p = ctx->priv; @@ -289,21 +293,20 @@ fail: return NULL; } - -static int rpi_control(MPGLContext *ctx, int *events, int request, void *arg) +static int rpi_control(struct ra_ctx *ctx, int *events, int request, void *arg) { struct priv *p = ctx->priv; switch (request) { case 
VOCTRL_SCREENSHOT_WIN: *(struct mp_image **)arg = take_screenshot(ctx); - return true; + return VO_TRUE; case VOCTRL_FULLSCREEN: recreate_dispmanx(ctx); return VO_TRUE; case VOCTRL_CHECK_EVENTS: if (atomic_fetch_and(&p->reload_display, 0)) { - MP_WARN(ctx->vo, "Recovering from display mode switch...\n"); + MP_WARN(ctx, "Recovering from display mode switch...\n"); recreate_dispmanx(ctx); } return VO_TRUE; @@ -315,12 +318,11 @@ static int rpi_control(MPGLContext *ctx, int *events, int request, void *arg) return VO_NOTIMPL; } -const struct mpgl_driver mpgl_driver_rpi = { +const struct ra_ctx_fns ra_ctx_rpi = { + .type = "opengl", .name = "rpi", - .priv_size = sizeof(struct priv), - .init = rpi_init, .reconfig = rpi_reconfig, - .swap_buffers = rpi_swap_buffers, .control = rpi_control, + .init = rpi_init, .uninit = rpi_uninit, -};
\ No newline at end of file +}; diff --git a/video/out/opengl/context_vdpau.c b/video/out/opengl/context_vdpau.c index 40d21ab..e989414 100644 --- a/video/out/opengl/context_vdpau.c +++ b/video/out/opengl/context_vdpau.c @@ -26,8 +26,6 @@ // follow it. I'm not sure about the original nvidia headers. #define BRAINDEATH(x) ((void *)(uintptr_t)(x)) -#define NUM_SURFACES 4 - struct surface { int w, h; VdpOutputSurface surface; @@ -39,21 +37,22 @@ struct surface { }; struct priv { + GL gl; GLXContext context; struct mp_vdpau_ctx *vdp; VdpPresentationQueueTarget vdp_target; VdpPresentationQueue vdp_queue; + struct surface *surfaces; int num_surfaces; - struct surface surfaces[NUM_SURFACES]; - int current_surface; + int idx_surfaces; }; typedef GLXContext (*glXCreateContextAttribsARBProc) (Display*, GLXFBConfig, GLXContext, Bool, const int*); -static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) +static bool create_context_x11(struct ra_ctx *ctx) { - struct priv *glx_ctx = ctx->priv; + struct priv *p = ctx->priv; struct vo *vo = ctx->vo; int glx_major, glx_minor; @@ -62,6 +61,9 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } + if (!ra_gl_ctx_test_version(ctx, MPGL_VER(glx_major, glx_minor), false)) + return false; + int glx_attribs[] = { GLX_X_RENDERABLE, True, GLX_X_VISUAL_TYPE, GLX_TRUE_COLOR, @@ -96,7 +98,7 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } - int ctx_flags = vo_flags & VOFLAG_GL_DEBUG ? GLX_CONTEXT_DEBUG_BIT_ARB : 0; + int ctx_flags = ctx->opts.debug ? 
GLX_CONTEXT_DEBUG_BIT_ARB : 0; int context_attribs[] = { GLX_CONTEXT_MAJOR_VERSION_ARB, 4, GLX_CONTEXT_MINOR_VERSION_ARB, 0, @@ -117,19 +119,20 @@ static bool create_context_x11(struct MPGLContext *ctx, int vo_flags) return false; } - glx_ctx->context = context; - mpgl_load_functions(ctx->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); + p->context = context; + mpgl_load_functions(&p->gl, (void *)glXGetProcAddressARB, glxstr, vo->log); return true; } -static int create_vdpau_objects(struct MPGLContext *ctx) +static int create_vdpau_objects(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + struct GL *gl = &p->gl; VdpDevice dev = p->vdp->vdp_device; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - ctx->gl->VDPAUInitNV(BRAINDEATH(dev), p->vdp->get_proc_address); + gl->VDPAUInitNV(BRAINDEATH(dev), p->vdp->get_proc_address); vdp_st = vdp->presentation_queue_target_create_x11(dev, ctx->vo->x11->window, &p->vdp_target); @@ -141,13 +144,13 @@ static int create_vdpau_objects(struct MPGLContext *ctx) return 0; } -static void destroy_vdpau_surface(struct MPGLContext *ctx, +static void destroy_vdpau_surface(struct ra_ctx *ctx, struct surface *surface) { struct priv *p = ctx->priv; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; + GL *gl = &p->gl; if (surface->mapped) gl->VDPAUUnmapSurfacesNV(1, &surface->registered); @@ -168,14 +171,14 @@ static void destroy_vdpau_surface(struct MPGLContext *ctx, }; } -static int recreate_vdpau_surface(struct MPGLContext *ctx, - struct surface *surface) +static bool recreate_vdpau_surface(struct ra_ctx *ctx, + struct surface *surface) { struct priv *p = ctx->priv; VdpDevice dev = p->vdp->vdp_device; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; + GL *gl = &p->gl; destroy_vdpau_surface(ctx, surface); @@ -219,16 +222,37 @@ static int recreate_vdpau_surface(struct MPGLContext *ctx, gl->VDPAUUnmapSurfacesNV(1, &surface->registered); surface->mapped = false; - 
return 0; + return true; error: destroy_vdpau_surface(ctx, surface); - return -1; + return false; +} + +static void vdpau_swap_buffers(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + struct vdp_functions *vdp = &p->vdp->vdp; + VdpStatus vdp_st; + + // This is the *next* surface we will be rendering to. By delaying the + // block_until_idle, we're essentially allowing p->num_surfaces - 1 + // in-flight surfaces, plus the one currently visible surface. + struct surface *surf = &p->surfaces[p->idx_surfaces]; + if (surf->surface == VDP_INVALID_HANDLE) + return; + + VdpTime prev_vsync_time; + vdp_st = vdp->presentation_queue_block_until_surface_idle(p->vdp_queue, + surf->surface, + &prev_vsync_time); + CHECK_VDP_WARNING(ctx, "waiting for surface failed"); } -static void glx_uninit(MPGLContext *ctx) +static void vdpau_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); if (p->vdp) { struct vdp_functions *vdp = &p->vdp->vdp; @@ -259,10 +283,12 @@ static void glx_uninit(MPGLContext *ctx) vo_x11_uninit(ctx->vo); } -static int glx_init(struct MPGLContext *ctx, int flags) +static const struct ra_swapchain_fns vdpau_swapchain; + +static bool vdpau_init(struct ra_ctx *ctx) { struct vo *vo = ctx->vo; - struct priv *p = ctx->priv; + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); p->vdp_queue = VDP_INVALID_HANDLE; p->vdp_target = VDP_INVALID_HANDLE; @@ -280,110 +306,112 @@ static int glx_init(struct MPGLContext *ctx, int flags) if (!vo_x11_create_vo_window(vo, NULL, "vdpauglx")) goto uninit; - if (!create_context_x11(ctx, flags)) + if (!create_context_x11(ctx)) goto uninit; - if (!(ctx->gl->mpgl_caps & MPGL_CAP_VDPAU)) + if (!(p->gl.mpgl_caps & MPGL_CAP_VDPAU)) goto uninit; if (create_vdpau_objects(ctx) < 0) goto uninit; - p->num_surfaces = NUM_SURFACES; + p->num_surfaces = ctx->opts.swapchain_depth + 1; // +1 for the visible image + p->surfaces = talloc_zero_array(p, struct surface, p->num_surfaces); for (int n = 0; n < 
p->num_surfaces; n++) p->surfaces[n].surface = VDP_INVALID_HANDLE; - ctx->flip_v = true; + struct ra_gl_ctx_params params = { + .swap_buffers = vdpau_swap_buffers, + .external_swapchain = &vdpau_swapchain, + .flipped = true, + }; - return 0; + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto uninit; + + return true; uninit: - glx_uninit(ctx); - return -1; + vdpau_uninit(ctx); + return false; } -static int glx_reconfig(struct MPGLContext *ctx) +static bool vdpau_start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo) { - vo_x11_config_vo_window(ctx->vo); - return 0; -} + struct priv *p = sw->ctx->priv; + struct vo *vo = sw->ctx->vo; + GL *gl = &p->gl; + + struct surface *surf = &p->surfaces[p->idx_surfaces]; + if (surf->w != vo->dwidth || surf->h != vo->dheight || + surf->surface == VDP_INVALID_HANDLE) + { + if (!recreate_vdpau_surface(sw->ctx, surf)) + return NULL; + } -static int glx_control(struct MPGLContext *ctx, int *events, int request, - void *arg) -{ - return vo_x11_control(ctx->vo, events, request, arg); + assert(!surf->mapped); + gl->VDPAUMapSurfacesNV(1, &surf->registered); + surf->mapped = true; + + ra_gl_ctx_resize(sw, surf->w, surf->h, surf->fbo); + return ra_gl_ctx_start_frame(sw, out_fbo); } -static void glx_start_frame(struct MPGLContext *ctx) +static bool vdpau_submit_frame(struct ra_swapchain *sw, + const struct vo_frame *frame) { - struct priv *p = ctx->priv; + struct priv *p = sw->ctx->priv; + GL *gl = &p->gl; struct vdp_functions *vdp = &p->vdp->vdp; VdpStatus vdp_st; - GL *gl = ctx->gl; - - struct surface *surface = &p->surfaces[p->current_surface]; - - if (surface->surface != VDP_INVALID_HANDLE) { - VdpTime prev_vsync_time; - vdp_st = vdp->presentation_queue_block_until_surface_idle(p->vdp_queue, - surface->surface, - &prev_vsync_time); - CHECK_VDP_WARNING(ctx, "waiting for surface failed"); - } - if (surface->w != ctx->vo->dwidth || surface->h != ctx->vo->dheight) - recreate_vdpau_surface(ctx, surface); + struct surface *surf = 
&p->surfaces[p->idx_surfaces]; + assert(surf->surface != VDP_INVALID_HANDLE); + assert(surf->mapped); + gl->VDPAUUnmapSurfacesNV(1, &surf->registered); + surf->mapped = false; + vdp_st = vdp->presentation_queue_display(p->vdp_queue, surf->surface, 0, 0, 0); + CHECK_VDP_WARNING(sw->ctx, "trying to present vdp surface"); - ctx->main_fb = surface->fbo; // 0 if creating the surface failed - - if (surface->surface != VDP_INVALID_HANDLE) { - gl->VDPAUMapSurfacesNV(1, &surface->registered); - surface->mapped = true; - } + p->idx_surfaces = (p->idx_surfaces + 1) % p->num_surfaces; + return ra_gl_ctx_submit_frame(sw, frame) && vdp_st == VDP_STATUS_OK; } -static void glx_swap_buffers(struct MPGLContext *ctx) +static bool vdpau_reconfig(struct ra_ctx *ctx) { - struct priv *p = ctx->priv; - struct vdp_functions *vdp = &p->vdp->vdp; - VdpStatus vdp_st; - GL *gl = ctx->gl; - - struct surface *surface = &p->surfaces[p->current_surface]; - if (surface->surface == VDP_INVALID_HANDLE) - return; // surface alloc probably failed before - - if (surface->mapped) - gl->VDPAUUnmapSurfacesNV(1, &surface->registered); - surface->mapped = false; - - vdp_st = vdp->presentation_queue_display(p->vdp_queue, surface->surface, - 0, 0, 0); - CHECK_VDP_WARNING(ctx, "trying to present vdp surface"); + vo_x11_config_vo_window(ctx->vo); + return true; +} - p->current_surface = (p->current_surface + 1) % p->num_surfaces; +static int vdpau_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + return vo_x11_control(ctx->vo, events, request, arg); } -static void glx_wakeup(struct MPGLContext *ctx) +static void vdpau_wakeup(struct ra_ctx *ctx) { vo_x11_wakeup(ctx->vo); } -static void glx_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void vdpau_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_x11_wait_events(ctx->vo, until_time_us); } -const struct mpgl_driver mpgl_driver_vdpauglx = { +static const struct ra_swapchain_fns vdpau_swapchain = { + .start_frame = 
vdpau_start_frame, + .submit_frame = vdpau_submit_frame, +}; + +const struct ra_ctx_fns ra_ctx_vdpauglx = { + .type = "opengl", .name = "vdpauglx", - .priv_size = sizeof(struct priv), - .init = glx_init, - .reconfig = glx_reconfig, - .start_frame = glx_start_frame, - .swap_buffers = glx_swap_buffers, - .control = glx_control, - .wakeup = glx_wakeup, - .wait_events = glx_wait_events, - .uninit = glx_uninit, + .reconfig = vdpau_reconfig, + .control = vdpau_control, + .wakeup = vdpau_wakeup, + .wait_events = vdpau_wait_events, + .init = vdpau_init, + .uninit = vdpau_uninit, }; diff --git a/video/out/opengl/context_wayland.c b/video/out/opengl/context_wayland.c index 87e98cd..f686fcc 100644 --- a/video/out/opengl/context_wayland.c +++ b/video/out/opengl/context_wayland.c @@ -16,189 +16,166 @@ * License along with mpv. If not, see <http://www.gnu.org/licenses/>. */ +#include <wayland-egl.h> +#include <EGL/egl.h> +#include <EGL/eglext.h> + #include "video/out/wayland_common.h" #include "context.h" #include "egl_helpers.h" +#include "utils.h" + +struct priv { + GL gl; + EGLDisplay egl_display; + EGLContext egl_context; + EGLSurface egl_surface; + EGLConfig egl_config; + struct wl_egl_window *egl_window; +}; -static void egl_resize(struct vo_wayland_state *wl) +static void resize(struct ra_ctx *ctx) { - int32_t x = wl->window.sh_x; - int32_t y = wl->window.sh_y; - int32_t width = wl->window.sh_width; - int32_t height = wl->window.sh_height; - int32_t scale = 1; - - if (!wl->egl_context.egl_window) - return; - - if (wl->display.current_output) - scale = wl->display.current_output->scale; - - // get the real size of the window - // this improves moving the window while resizing it - wl_egl_window_get_attached_size(wl->egl_context.egl_window, - &wl->window.width, - &wl->window.height); + struct priv *p = ctx->priv; + struct vo_wayland_state *wl = ctx->vo->wl; - MP_VERBOSE(wl, "resizing %dx%d -> %dx%d\n", wl->window.width, - wl->window.height, - width, - height); + 
MP_VERBOSE(wl, "Handling resize on the egl side\n"); - if (x != 0) - x = wl->window.width - width; + const int32_t width = wl->scaling*mp_rect_w(wl->geometry); + const int32_t height = wl->scaling*mp_rect_h(wl->geometry); - if (y != 0) - y = wl->window.height - height; + wl_surface_set_buffer_scale(wl->surface, wl->scaling); + wl_egl_window_resize(p->egl_window, width, height, 0, 0); - wl_surface_set_buffer_scale(wl->window.video_surface, scale); - wl_egl_window_resize(wl->egl_context.egl_window, scale*width, scale*height, x, y); - - wl->window.width = width; - wl->window.height = height; + wl->vo->dwidth = width; + wl->vo->dheight = height; +} - /* set size for mplayer */ - wl->vo->dwidth = scale*wl->window.width; - wl->vo->dheight = scale*wl->window.height; - wl->vo->want_redraw = true; +static void wayland_egl_swap_buffers(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + eglSwapBuffers(p->egl_display, p->egl_surface); } -static int egl_create_context(struct vo_wayland_state *wl, MPGLContext *ctx, - int flags) +static bool egl_create_context(struct ra_ctx *ctx) { - GL *gl = ctx->gl; + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + struct vo_wayland_state *wl = ctx->vo->wl; - if (!(wl->egl_context.egl.dpy = eglGetDisplay(wl->display.display))) - return -1; + if (!(p->egl_display = eglGetDisplay(wl->display))) + return false; - if (eglInitialize(wl->egl_context.egl.dpy, NULL, NULL) != EGL_TRUE) - return -1; + if (eglInitialize(p->egl_display, NULL, NULL) != EGL_TRUE) + return false; - if (!mpegl_create_context(wl->egl_context.egl.dpy, wl->log, flags, - &wl->egl_context.egl.ctx, - &wl->egl_context.egl.conf)) - return -1; + if (!mpegl_create_context(ctx, p->egl_display, &p->egl_context, + &p->egl_config)) + return false; - eglMakeCurrent(wl->egl_context.egl.dpy, NULL, NULL, wl->egl_context.egl.ctx); + eglMakeCurrent(p->egl_display, NULL, NULL, p->egl_context); - mpegl_load_functions(gl, wl->log); + mpegl_load_functions(&p->gl, wl->log); - 
ctx->native_display_type = "wl"; - ctx->native_display = wl->display.display; + struct ra_gl_ctx_params params = { + .swap_buffers = wayland_egl_swap_buffers, + .native_display_type = "wl", + .native_display = wl->display, + }; - return 0; -} + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + return false; -static void egl_create_window(struct vo_wayland_state *wl) -{ - wl->egl_context.egl_window = wl_egl_window_create(wl->window.video_surface, - wl->window.width, - wl->window.height); - - wl->egl_context.egl_surface = eglCreateWindowSurface(wl->egl_context.egl.dpy, - wl->egl_context.egl.conf, - wl->egl_context.egl_window, - NULL); - - eglMakeCurrent(wl->egl_context.egl.dpy, - wl->egl_context.egl_surface, - wl->egl_context.egl_surface, - wl->egl_context.egl.ctx); - - wl_display_dispatch_pending(wl->display.display); - - /** - * <http://lists.freedesktop.org/archives/wayland-devel/2013-November/012019.html> - * - * The main change is that if the swap interval is 0 then Mesa won't install a - * frame callback so that eglSwapBuffers can be executed as often as necessary. - * Instead it will do a sync request after the swap buffers. It will block for - * sync complete event in get_back_bo instead of the frame callback. The - * compositor is likely to send a release event while processing the new buffer - * attach and this makes sure we will receive that before deciding whether to - * allocate a new buffer. 
- */ - - eglSwapInterval(wl->egl_context.egl.dpy, 0); + return true; } -static int waylandgl_reconfig(struct MPGLContext *ctx) +static void egl_create_window(struct ra_ctx *ctx) { - struct vo_wayland_state * wl = ctx->vo->wayland; + struct priv *p = ctx->priv; + struct vo_wayland_state *wl = ctx->vo->wl; - if (!vo_wayland_config(ctx->vo)) - return -1; + p->egl_window = wl_egl_window_create(wl->surface, mp_rect_w(wl->geometry), + mp_rect_h(wl->geometry)); - if (!wl->egl_context.egl_window) - egl_create_window(wl); + p->egl_surface = eglCreateWindowSurface(p->egl_display, p->egl_config, + p->egl_window, NULL); - return 0; + eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context); + + eglSwapInterval(p->egl_display, 0); } -static void waylandgl_uninit(MPGLContext *ctx) +static bool wayland_egl_reconfig(struct ra_ctx *ctx) { - struct vo_wayland_state *wl = ctx->vo->wayland; + struct priv *p = ctx->priv; - if (wl->egl_context.egl.ctx) { - eglReleaseThread(); - if (wl->egl_context.egl_window) - wl_egl_window_destroy(wl->egl_context.egl_window); - eglDestroySurface(wl->egl_context.egl.dpy, wl->egl_context.egl_surface); - eglMakeCurrent(wl->egl_context.egl.dpy, NULL, NULL, EGL_NO_CONTEXT); - eglDestroyContext(wl->egl_context.egl.dpy, wl->egl_context.egl.ctx); - } - eglTerminate(wl->egl_context.egl.dpy); - wl->egl_context.egl.ctx = NULL; + if (!vo_wayland_reconfig(ctx->vo)) + return false; - vo_wayland_uninit(ctx->vo); + if (!p->egl_window) + egl_create_window(ctx); + + return true; } -static void waylandgl_swap_buffers(MPGLContext *ctx) +static void wayland_egl_uninit(struct ra_ctx *ctx) { - struct vo_wayland_state *wl = ctx->vo->wayland; + struct priv *p = ctx->priv; - vo_wayland_wait_events(ctx->vo, 0); + ra_gl_ctx_uninit(ctx); - eglSwapBuffers(wl->egl_context.egl.dpy, wl->egl_context.egl_surface); + if (p->egl_context) { + eglReleaseThread(); + if (p->egl_window) + wl_egl_window_destroy(p->egl_window); + eglDestroySurface(p->egl_display, 
p->egl_surface); + eglMakeCurrent(p->egl_display, NULL, NULL, EGL_NO_CONTEXT); + eglDestroyContext(p->egl_display, p->egl_context); + p->egl_context = NULL; + } + eglTerminate(p->egl_display); + + vo_wayland_uninit(ctx->vo); } -static int waylandgl_control(MPGLContext *ctx, int *events, int request, +static int wayland_egl_control(struct ra_ctx *ctx, int *events, int request, void *data) { - struct vo_wayland_state *wl = ctx->vo->wayland; + struct vo_wayland_state *wl = ctx->vo->wl; int r = vo_wayland_control(ctx->vo, events, request, data); - if (*events & VO_EVENT_RESIZE) - egl_resize(wl); + if (*events & VO_EVENT_RESIZE) { + resize(ctx); + ra_gl_ctx_resize(ctx->swapchain, wl->vo->dwidth, wl->vo->dheight, 0); + } return r; } -static void wayland_wakeup(struct MPGLContext *ctx) +static void wayland_egl_wakeup(struct ra_ctx *ctx) { vo_wayland_wakeup(ctx->vo); } -static void wayland_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void wayland_egl_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_wayland_wait_events(ctx->vo, until_time_us); } -static int waylandgl_init(struct MPGLContext *ctx, int flags) +static bool wayland_egl_init(struct ra_ctx *ctx) { if (!vo_wayland_init(ctx->vo)) - return -1; + return false; - return egl_create_context(ctx->vo->wayland, ctx, flags); + return egl_create_context(ctx); } -const struct mpgl_driver mpgl_driver_wayland = { +const struct ra_ctx_fns ra_ctx_wayland_egl = { + .type = "opengl", .name = "wayland", - .init = waylandgl_init, - .reconfig = waylandgl_reconfig, - .swap_buffers = waylandgl_swap_buffers, - .control = waylandgl_control, - .wakeup = wayland_wakeup, - .wait_events = wayland_wait_events, - .uninit = waylandgl_uninit, + .reconfig = wayland_egl_reconfig, + .control = wayland_egl_control, + .wakeup = wayland_egl_wakeup, + .wait_events = wayland_egl_wait_events, + .init = wayland_egl_init, + .uninit = wayland_egl_uninit, }; diff --git a/video/out/opengl/context_w32.c 
b/video/out/opengl/context_win.c index eb61239..5a0042b 100644 --- a/video/out/opengl/context_w32.c +++ b/video/out/opengl/context_win.c @@ -21,8 +21,8 @@ #include "options/m_config.h" #include "video/out/w32_common.h" -#include "video/out/win32/exclusive_hack.h" #include "context.h" +#include "utils.h" #if !defined(WGL_CONTEXT_MAJOR_VERSION_ARB) /* these are supposed to be defined in wingdi.h but mingw's is too old */ @@ -37,7 +37,9 @@ #define WGL_CONTEXT_CORE_PROFILE_BIT_ARB 0x00000001 #endif -struct w32_context { +struct priv { + GL gl; + int opt_swapinterval; int current_swapinterval; @@ -45,26 +47,25 @@ struct w32_context { HGLRC context; HDC hdc; - int flags; }; -static void w32_uninit(MPGLContext *ctx); +static void wgl_uninit(struct ra_ctx *ctx); -static __thread struct w32_context *current_w32_context; +static __thread struct priv *current_wgl_context; -static int GLAPIENTRY w32_swap_interval(int interval) +static int GLAPIENTRY wgl_swap_interval(int interval) { - if (current_w32_context) - current_w32_context->opt_swapinterval = interval; + if (current_wgl_context) + current_wgl_context->opt_swapinterval = interval; return 0; } -static bool create_dc(struct MPGLContext *ctx, int flags) +static bool create_dc(struct ra_ctx *ctx) { - struct w32_context *w32_ctx = ctx->priv; + struct priv *p = ctx->priv; HWND win = vo_w32_hwnd(ctx->vo); - if (w32_ctx->hdc) + if (p->hdc) return true; HDC hdc = GetDC(win); @@ -90,11 +91,11 @@ static bool create_dc(struct MPGLContext *ctx, int flags) SetPixelFormat(hdc, pf, &pfd); - w32_ctx->hdc = hdc; + p->hdc = hdc; return true; } -static void *w32gpa(const GLubyte *procName) +static void *wglgpa(const GLubyte *procName) { HMODULE oglmod; void *res = wglGetProcAddress(procName); @@ -104,11 +105,11 @@ static void *w32gpa(const GLubyte *procName) return GetProcAddress(oglmod, procName); } -static bool create_context_w32_old(struct MPGLContext *ctx) +static bool create_context_wgl_old(struct ra_ctx *ctx) { - struct w32_context 
*w32_ctx = ctx->priv; + struct priv *p = ctx->priv; - HDC windc = w32_ctx->hdc; + HDC windc = p->hdc; bool res = false; HGLRC context = wglCreateContext(windc); @@ -123,17 +124,15 @@ static bool create_context_w32_old(struct MPGLContext *ctx) return res; } - w32_ctx->context = context; - - mpgl_load_functions(ctx->gl, w32gpa, NULL, ctx->vo->log); + p->context = context; return true; } -static bool create_context_w32_gl3(struct MPGLContext *ctx) +static bool create_context_wgl_gl3(struct ra_ctx *ctx) { - struct w32_context *w32_ctx = ctx->priv; + struct priv *p = ctx->priv; - HDC windc = w32_ctx->hdc; + HDC windc = p->hdc; HGLRC context = 0; // A legacy context is needed to get access to the new functions. @@ -150,7 +149,7 @@ static bool create_context_w32_gl3(struct MPGLContext *ctx) } const char *(GLAPIENTRY *wglGetExtensionsStringARB)(HDC hdc) - = w32gpa((const GLubyte*)"wglGetExtensionsStringARB"); + = wglgpa((const GLubyte*)"wglGetExtensionsStringARB"); if (!wglGetExtensionsStringARB) goto unsupported; @@ -161,7 +160,7 @@ static bool create_context_w32_gl3(struct MPGLContext *ctx) HGLRC (GLAPIENTRY *wglCreateContextAttribsARB)(HDC hDC, HGLRC hShareContext, const int *attribList) - = w32gpa((const GLubyte*)"wglCreateContextAttribsARB"); + = wglgpa((const GLubyte*)"wglCreateContextAttribsARB"); if (!wglCreateContextAttribsARB) goto unsupported; @@ -197,11 +196,7 @@ static bool create_context_w32_gl3(struct MPGLContext *ctx) return false; } - w32_ctx->context = context; - - /* update function pointers */ - mpgl_load_functions(ctx->gl, w32gpa, NULL, ctx->vo->log); - + p->context = context; return true; unsupported: @@ -214,79 +209,20 @@ out: static void create_ctx(void *ptr) { - struct MPGLContext *ctx = ptr; - struct w32_context *w32_ctx = ctx->priv; + struct ra_ctx *ctx = ptr; + struct priv *p = ctx->priv; - if (!create_dc(ctx, w32_ctx->flags)) + if (!create_dc(ctx)) return; - create_context_w32_gl3(ctx); - if (!w32_ctx->context) - create_context_w32_old(ctx); - 
- wglMakeCurrent(w32_ctx->hdc, NULL); -} - -static int w32_init(struct MPGLContext *ctx, int flags) -{ - if (!vo_w32_init(ctx->vo)) - goto fail; - - struct w32_context *w32_ctx = ctx->priv; + create_context_wgl_gl3(ctx); + if (!p->context) + create_context_wgl_old(ctx); - w32_ctx->flags = flags; - vo_w32_run_on_thread(ctx->vo, create_ctx, ctx); - - if (!w32_ctx->context) - goto fail; - - if (!ctx->gl->SwapInterval) - MP_VERBOSE(ctx->vo, "WGL_EXT_swap_control missing.\n"); - w32_ctx->real_wglSwapInterval = ctx->gl->SwapInterval; - ctx->gl->SwapInterval = w32_swap_interval; - w32_ctx->current_swapinterval = -1; - - current_w32_context = w32_ctx; - wglMakeCurrent(w32_ctx->hdc, w32_ctx->context); - DwmEnableMMCSS(TRUE); - return 0; - -fail: - w32_uninit(ctx); - return -1; + wglMakeCurrent(p->hdc, NULL); } -static int w32_reconfig(struct MPGLContext *ctx) -{ - vo_w32_config(ctx->vo); - return 0; -} - -static void destroy_gl(void *ptr) -{ - struct MPGLContext *ctx = ptr; - struct w32_context *w32_ctx = ctx->priv; - if (w32_ctx->context) - wglDeleteContext(w32_ctx->context); - w32_ctx->context = 0; - if (w32_ctx->hdc) - ReleaseDC(vo_w32_hwnd(ctx->vo), w32_ctx->hdc); - w32_ctx->hdc = NULL; - current_w32_context = NULL; -} - -static void w32_uninit(MPGLContext *ctx) -{ - struct w32_context *w32_ctx = ctx->priv; - if (w32_ctx->context) - wglMakeCurrent(w32_ctx->hdc, 0); - vo_w32_run_on_thread(ctx->vo, destroy_gl, ctx); - - DwmEnableMMCSS(FALSE); - vo_w32_uninit(ctx->vo); -} - -static bool compositor_active(MPGLContext *ctx) +static bool compositor_active(struct ra_ctx *ctx) { // For Windows 7. BOOL enabled = 0; @@ -300,21 +236,16 @@ static bool compositor_active(MPGLContext *ctx) if (FAILED(DwmGetCompositionTimingInfo(0, &info))) return false; - // Test if a program is running in exclusive fullscreen mode. If so, it's - // probably this one, so it's not getting redirected by the compositor. 
- if (mp_w32_is_in_exclusive_mode()) - return false; - return true; } -static void w32_swap_buffers(MPGLContext *ctx) +static void wgl_swap_buffers(struct ra_ctx *ctx) { - struct w32_context *w32_ctx = ctx->priv; - SwapBuffers(w32_ctx->hdc); + struct priv *p = ctx->priv; + SwapBuffers(p->hdc); // default if we don't DwmFLush - int new_swapinterval = w32_ctx->opt_swapinterval; + int new_swapinterval = p->opt_swapinterval; int dwm_flush_opt; mp_read_option_raw(ctx->global, "opengl-dwmflush", &m_option_type_choice, @@ -330,26 +261,103 @@ static void w32_swap_buffers(MPGLContext *ctx) } } - if (new_swapinterval != w32_ctx->current_swapinterval && - w32_ctx->real_wglSwapInterval) + if (new_swapinterval != p->current_swapinterval && + p->real_wglSwapInterval) { - w32_ctx->real_wglSwapInterval(new_swapinterval); + p->real_wglSwapInterval(new_swapinterval); MP_VERBOSE(ctx->vo, "set SwapInterval(%d)\n", new_swapinterval); } - w32_ctx->current_swapinterval = new_swapinterval; + p->current_swapinterval = new_swapinterval; +} + +static bool wgl_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + GL *gl = &p->gl; + + if (!vo_w32_init(ctx->vo)) + goto fail; + + vo_w32_run_on_thread(ctx->vo, create_ctx, ctx); + if (!p->context) + goto fail; + + current_wgl_context = p; + wglMakeCurrent(p->hdc, p->context); + + mpgl_load_functions(gl, wglgpa, NULL, ctx->vo->log); + + if (!gl->SwapInterval) + MP_VERBOSE(ctx->vo, "WGL_EXT_swap_control missing.\n"); + p->real_wglSwapInterval = gl->SwapInterval; + gl->SwapInterval = wgl_swap_interval; + p->current_swapinterval = -1; + + struct ra_gl_ctx_params params = { + .swap_buffers = wgl_swap_buffers, + }; + + if (!ra_gl_ctx_init(ctx, gl, params)) + goto fail; + + DwmEnableMMCSS(TRUE); + return true; + +fail: + wgl_uninit(ctx); + return false; +} + +static void resize(struct ra_ctx *ctx) +{ + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); +} + +static bool wgl_reconfig(struct ra_ctx 
*ctx) +{ + vo_w32_config(ctx->vo); + resize(ctx); + return true; +} + +static void destroy_gl(void *ptr) +{ + struct ra_ctx *ctx = ptr; + struct priv *p = ctx->priv; + if (p->context) + wglDeleteContext(p->context); + p->context = 0; + if (p->hdc) + ReleaseDC(vo_w32_hwnd(ctx->vo), p->hdc); + p->hdc = NULL; + current_wgl_context = NULL; +} + +static void wgl_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + if (p->context) + wglMakeCurrent(p->hdc, 0); + vo_w32_run_on_thread(ctx->vo, destroy_gl, ctx); + + DwmEnableMMCSS(FALSE); + vo_w32_uninit(ctx->vo); } -static int w32_control(MPGLContext *ctx, int *events, int request, void *arg) +static int wgl_control(struct ra_ctx *ctx, int *events, int request, void *arg) { - return vo_w32_control(ctx->vo, events, request, arg); + int ret = vo_w32_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; } -const struct mpgl_driver mpgl_driver_w32 = { +const struct ra_ctx_fns ra_ctx_wgl = { + .type = "opengl", .name = "win", - .priv_size = sizeof(struct w32_context), - .init = w32_init, - .reconfig = w32_reconfig, - .swap_buffers = w32_swap_buffers, - .control = w32_control, - .uninit = w32_uninit, + .init = wgl_init, + .reconfig = wgl_reconfig, + .control = wgl_control, + .uninit = wgl_uninit, }; diff --git a/video/out/opengl/context_x11egl.c b/video/out/opengl/context_x11egl.c index 2b68007..7ab4fe0 100644 --- a/video/out/opengl/context_x11egl.c +++ b/video/out/opengl/context_x11egl.c @@ -32,14 +32,17 @@ #include "egl_helpers.h" struct priv { + GL gl; EGLDisplay egl_display; EGLContext egl_context; EGLSurface egl_surface; }; -static void mpegl_uninit(MPGLContext *ctx) +static void mpegl_uninit(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + ra_gl_ctx_uninit(ctx); + if (p->egl_context) { eglMakeCurrent(p->egl_display, EGL_NO_SURFACE, EGL_NO_SURFACE, EGL_NO_CONTEXT); @@ -51,7 +54,7 @@ static void mpegl_uninit(MPGLContext *ctx) static int 
pick_xrgba_config(void *user_data, EGLConfig *configs, int num_configs) { - struct MPGLContext *ctx = user_data; + struct ra_ctx *ctx = user_data; struct priv *p = ctx->priv; struct vo *vo = ctx->vo; @@ -72,40 +75,44 @@ static int pick_xrgba_config(void *user_data, EGLConfig *configs, int num_config return 0; } -static int mpegl_init(struct MPGLContext *ctx, int flags) +static void mpegl_swap_buffers(struct ra_ctx *ctx) { struct priv *p = ctx->priv; + eglSwapBuffers(p->egl_display, p->egl_surface); +} + +static bool mpegl_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); struct vo *vo = ctx->vo; - int msgl = vo->probing ? MSGL_V : MSGL_FATAL; + int msgl = ctx->opts.probing ? MSGL_V : MSGL_FATAL; if (!vo_x11_init(vo)) goto uninit; p->egl_display = eglGetDisplay(vo->x11->display); if (!eglInitialize(p->egl_display, NULL, NULL)) { - mp_msg(vo->log, msgl, "Could not initialize EGL.\n"); + MP_MSG(ctx, msgl, "Could not initialize EGL.\n"); goto uninit; } - struct mpegl_opts opts = { - .vo_flags = flags, + struct mpegl_cb cb = { .user_data = ctx, - .refine_config = (flags & VOFLAG_ALPHA) ? pick_xrgba_config : NULL, + .refine_config = ctx->opts.want_alpha ? 
pick_xrgba_config : NULL, }; EGLConfig config; - if (!mpegl_create_context_opts(p->egl_display, vo->log, &opts, - &p->egl_context, &config)) + if (!mpegl_create_context_cb(ctx, p->egl_display, cb, &p->egl_context, &config)) goto uninit; int vID, n; eglGetConfigAttrib(p->egl_display, config, EGL_NATIVE_VISUAL_ID, &vID); - MP_VERBOSE(vo, "chose visual 0x%x\n", vID); + MP_VERBOSE(ctx, "chose visual 0x%x\n", vID); XVisualInfo template = {.visualid = vID}; XVisualInfo *vi = XGetVisualInfo(vo->x11->display, VisualIDMask, &template, &n); if (!vi) { - MP_FATAL(vo, "Getting X visual failed!\n"); + MP_FATAL(ctx, "Getting X visual failed!\n"); goto uninit; } @@ -120,64 +127,73 @@ static int mpegl_init(struct MPGLContext *ctx, int flags) (EGLNativeWindowType)vo->x11->window, NULL); if (p->egl_surface == EGL_NO_SURFACE) { - MP_FATAL(ctx->vo, "Could not create EGL surface!\n"); + MP_FATAL(ctx, "Could not create EGL surface!\n"); goto uninit; } if (!eglMakeCurrent(p->egl_display, p->egl_surface, p->egl_surface, p->egl_context)) { - MP_FATAL(ctx->vo, "Could not make context current!\n"); + MP_FATAL(ctx, "Could not make context current!\n"); goto uninit; } - mpegl_load_functions(ctx->gl, vo->log); + mpegl_load_functions(&p->gl, ctx->log); - ctx->native_display_type = "x11"; - ctx->native_display = vo->x11->display; - return 0; + struct ra_gl_ctx_params params = { + .swap_buffers = mpegl_swap_buffers, + .native_display_type = "x11", + .native_display = vo->x11->display, + }; + + if (!ra_gl_ctx_init(ctx, &p->gl, params)) + goto uninit; + + return true; uninit: mpegl_uninit(ctx); - return -1; + return false; } -static int mpegl_reconfig(struct MPGLContext *ctx) +static void resize(struct ra_ctx *ctx) { - vo_x11_config_vo_window(ctx->vo); - return 0; + ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0); } -static int mpegl_control(struct MPGLContext *ctx, int *events, int request, - void *arg) +static bool mpegl_reconfig(struct ra_ctx *ctx) { - return 
vo_x11_control(ctx->vo, events, request, arg); + vo_x11_config_vo_window(ctx->vo); + resize(ctx); + return true; } -static void mpegl_swap_buffers(MPGLContext *ctx) +static int mpegl_control(struct ra_ctx *ctx, int *events, int request, + void *arg) { - struct priv *p = ctx->priv; - eglSwapBuffers(p->egl_display, p->egl_surface); + int ret = vo_x11_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) + resize(ctx); + return ret; } -static void mpegl_wakeup(struct MPGLContext *ctx) +static void mpegl_wakeup(struct ra_ctx *ctx) { vo_x11_wakeup(ctx->vo); } -static void mpegl_wait_events(struct MPGLContext *ctx, int64_t until_time_us) +static void mpegl_wait_events(struct ra_ctx *ctx, int64_t until_time_us) { vo_x11_wait_events(ctx->vo, until_time_us); } -const struct mpgl_driver mpgl_driver_x11egl = { +const struct ra_ctx_fns ra_ctx_x11_egl = { + .type = "opengl", .name = "x11egl", - .priv_size = sizeof(struct priv), - .init = mpegl_init, .reconfig = mpegl_reconfig, - .swap_buffers = mpegl_swap_buffers, .control = mpegl_control, .wakeup = mpegl_wakeup, .wait_events = mpegl_wait_events, + .init = mpegl_init, .uninit = mpegl_uninit, }; diff --git a/video/out/opengl/egl_helpers.c b/video/out/opengl/egl_helpers.c index ac152df..0033bf1 100644 --- a/video/out/opengl/egl_helpers.c +++ b/video/out/opengl/egl_helpers.c @@ -25,6 +25,7 @@ #include "egl_helpers.h" #include "common.h" +#include "utils.h" #include "context.h" #if HAVE_EGL_ANGLE @@ -43,41 +44,49 @@ #define EGL_OPENGL_ES3_BIT 0x00000040 #endif -// es_version = 0 (desktop), 2/3 (ES major version) -static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, - int es_version, struct mpegl_opts *opts, +// es_version: 0 (core), 2 or 3 +static bool create_context(struct ra_ctx *ctx, EGLDisplay display, + int es_version, struct mpegl_cb cb, EGLContext *out_context, EGLConfig *out_config) { - int msgl = probing ? 
MSGL_V : MSGL_FATAL; - - EGLenum api = EGL_OPENGL_API; - EGLint rend = EGL_OPENGL_BIT; - const char *name = "Desktop OpenGL"; - if (es_version == 2) { + int msgl = ctx->opts.probing ? MSGL_V : MSGL_FATAL; + + EGLenum api; + EGLint rend; + const char *name; + + switch (es_version) { + case 0: + api = EGL_OPENGL_API; + rend = EGL_OPENGL_BIT; + name = "Desktop OpenGL"; + break; + case 2: api = EGL_OPENGL_ES_API; rend = EGL_OPENGL_ES2_BIT; - name = "GLES 2.0"; - } - if (es_version == 3) { + name = "GLES 2.x"; + break; + case 3: api = EGL_OPENGL_ES_API; rend = EGL_OPENGL_ES3_BIT; name = "GLES 3.x"; + break; + default: abort(); } - mp_msg(log, MSGL_V, "Trying to create %s context.\n", name); + MP_VERBOSE(ctx, "Trying to create %s context.\n", name); if (!eglBindAPI(api)) { - mp_msg(log, MSGL_V, "Could not bind API!\n"); + MP_VERBOSE(ctx, "Could not bind API!\n"); return false; } - EGLint attributes[] = { EGL_SURFACE_TYPE, EGL_WINDOW_BIT, EGL_RED_SIZE, 1, EGL_GREEN_SIZE, 1, EGL_BLUE_SIZE, 1, - EGL_ALPHA_SIZE, (opts->vo_flags & VOFLAG_ALPHA ) ? 1 : 0, + EGL_ALPHA_SIZE, ctx->opts.want_alpha ? 
1 : 0, EGL_RENDERABLE_TYPE, rend, EGL_NONE }; @@ -92,29 +101,34 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, if (!num_configs) { talloc_free(configs); - mp_msg(log, msgl, "Could not choose EGLConfig!\n"); + MP_MSG(ctx, msgl, "Could not choose EGLConfig!\n"); return false; } int chosen = 0; - if (opts->refine_config) - chosen = opts->refine_config(opts->user_data, configs, num_configs); + if (cb.refine_config) + chosen = cb.refine_config(cb.user_data, configs, num_configs); EGLConfig config = configs[chosen]; talloc_free(configs); - EGLContext *ctx = NULL; + EGLContext *egl_ctx = NULL; if (es_version) { + if (!ra_gl_ctx_test_version(ctx, MPGL_VER(es_version, 0), true)) + return false; + EGLint attrs[] = { EGL_CONTEXT_CLIENT_VERSION, es_version, EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); } else { for (int n = 0; mpgl_preferred_gl_versions[n]; n++) { int ver = mpgl_preferred_gl_versions[n]; + if (!ra_gl_ctx_test_version(ctx, ver, false)) + continue; EGLint attrs[] = { EGL_CONTEXT_MAJOR_VERSION, MPGL_VER_GET_MAJOR(ver), @@ -124,25 +138,25 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); - if (ctx) + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + if (egl_ctx) break; } - if (!ctx) { + if (!egl_ctx && ra_gl_ctx_test_version(ctx, 140, false)) { // Fallback for EGL 1.4 without EGL_KHR_create_context. 
EGLint attrs[] = { EGL_NONE }; - ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); + egl_ctx = eglCreateContext(display, config, EGL_NO_CONTEXT, attrs); } } - if (!ctx) { - mp_msg(log, msgl, "Could not create EGL context!\n"); + if (!egl_ctx) { + MP_MSG(ctx, msgl, "Could not create EGL context!\n"); return false; } - *out_context = ctx; + *out_context = egl_ctx; *out_config = config; return true; } @@ -152,56 +166,36 @@ static bool create_context(EGLDisplay display, struct mp_log *log, bool probing, // Create a context and return it and the config it was created with. If it // returns false, the out_* pointers are set to NULL. // vo_flags is a combination of VOFLAG_* values. -bool mpegl_create_context(EGLDisplay display, struct mp_log *log, int vo_flags, +bool mpegl_create_context(struct ra_ctx *ctx, EGLDisplay display, EGLContext *out_context, EGLConfig *out_config) { - return mpegl_create_context_opts(display, log, - &(struct mpegl_opts){.vo_flags = vo_flags}, out_context, out_config); + return mpegl_create_context_cb(ctx, display, (struct mpegl_cb){0}, + out_context, out_config); } // Create a context and return it and the config it was created with. If it // returns false, the out_* pointers are set to NULL. 
-bool mpegl_create_context_opts(EGLDisplay display, struct mp_log *log, - struct mpegl_opts *opts, - EGLContext *out_context, EGLConfig *out_config) +bool mpegl_create_context_cb(struct ra_ctx *ctx, EGLDisplay display, + struct mpegl_cb cb, EGLContext *out_context, + EGLConfig *out_config) { - assert(opts); - *out_context = NULL; *out_config = NULL; const char *version = eglQueryString(display, EGL_VERSION); const char *vendor = eglQueryString(display, EGL_VENDOR); const char *apis = eglQueryString(display, EGL_CLIENT_APIS); - mp_verbose(log, "EGL_VERSION=%s\nEGL_VENDOR=%s\nEGL_CLIENT_APIS=%s\n", + MP_VERBOSE(ctx, "EGL_VERSION=%s\nEGL_VENDOR=%s\nEGL_CLIENT_APIS=%s\n", STR_OR_ERR(version), STR_OR_ERR(vendor), STR_OR_ERR(apis)); - bool probing = opts->vo_flags & VOFLAG_PROBING; - int msgl = probing ? MSGL_V : MSGL_FATAL; - bool try_gles = !(opts->vo_flags & VOFLAG_NO_GLES); - - if (!(opts->vo_flags & VOFLAG_GLES)) { - // Desktop OpenGL - if (create_context(display, log, try_gles | probing, 0, opts, - out_context, out_config)) - return true; - } - - if (try_gles && !(opts->vo_flags & VOFLAG_GLES2)) { - // ES 3.x - if (create_context(display, log, true, 3, opts, - out_context, out_config)) - return true; - } - - if (try_gles) { - // ES 2.0 - if (create_context(display, log, probing, 2, opts, - out_context, out_config)) + int es[] = {0, 3, 2}; // preference order + for (int i = 0; i < MP_ARRAY_SIZE(es); i++) { + if (create_context(ctx, display, es[i], cb, out_context, out_config)) return true; } - mp_msg(log, msgl, "Could not create a GL context.\n"); + int msgl = ctx->opts.probing ? 
MSGL_V : MSGL_ERR; + MP_MSG(ctx, msgl, "Could not create a GL context.\n"); return false; } diff --git a/video/out/opengl/egl_helpers.h b/video/out/opengl/egl_helpers.h index 05f9dcc..eaaf9d7 100644 --- a/video/out/opengl/egl_helpers.h +++ b/video/out/opengl/egl_helpers.h @@ -6,26 +6,23 @@ #include <EGL/egl.h> #include <EGL/eglext.h> +#include "video/out/gpu/context.h" + struct mp_log; -bool mpegl_create_context(EGLDisplay display, struct mp_log *log, int vo_flags, +bool mpegl_create_context(struct ra_ctx *ctx, EGLDisplay display, EGLContext *out_context, EGLConfig *out_config); -struct mpegl_opts { - // combination of VOFLAG_* values. - int vo_flags; - - // for callbacks - void *user_data; - +struct mpegl_cb { // if set, pick the desired config from the given list and return its index // defaults to 0 (they are sorted by eglChooseConfig) int (*refine_config)(void *user_data, EGLConfig *configs, int num_configs); + void *user_data; }; -bool mpegl_create_context_opts(EGLDisplay display, struct mp_log *log, - struct mpegl_opts *opts, - EGLContext *out_context, EGLConfig *out_config); +bool mpegl_create_context_cb(struct ra_ctx *ctx, EGLDisplay display, + struct mpegl_cb cb, EGLContext *out_context, + EGLConfig *out_config); struct GL; void mpegl_load_functions(struct GL *gl, struct mp_log *log); diff --git a/video/out/opengl/formats.h b/video/out/opengl/formats.h index 3da6ede..f727a3b 100644 --- a/video/out/opengl/formats.h +++ b/video/out/opengl/formats.h @@ -2,7 +2,6 @@ #define MPGL_FORMATS_H_ #include "common.h" -#include "ra.h" struct gl_format { const char *name; // symbolic name for user interaction/debugging diff --git a/video/out/opengl/gl_utils.c b/video/out/opengl/gl_utils.c deleted file mode 100644 index bce2dab..0000000 --- a/video/out/opengl/gl_utils.c +++ /dev/null @@ -1,291 +0,0 @@ -/* - * This file is part of mpv. - * Parts based on MPlayer code by Reimar Döffinger. 
- * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. - */ - -#include <stddef.h> -#include <stdint.h> -#include <stdlib.h> -#include <string.h> -#include <stdarg.h> -#include <assert.h> - -#include <libavutil/sha.h> -#include <libavutil/intreadwrite.h> -#include <libavutil/mem.h> - -#include "osdep/io.h" - -#include "common/common.h" -#include "options/path.h" -#include "stream/stream.h" -#include "formats.h" -#include "ra_gl.h" -#include "gl_utils.h" - -// GLU has this as gluErrorString (we don't use GLU, as it is legacy-OpenGL) -static const char *gl_error_to_string(GLenum error) -{ - switch (error) { - case GL_INVALID_ENUM: return "INVALID_ENUM"; - case GL_INVALID_VALUE: return "INVALID_VALUE"; - case GL_INVALID_OPERATION: return "INVALID_OPERATION"; - case GL_INVALID_FRAMEBUFFER_OPERATION: return "INVALID_FRAMEBUFFER_OPERATION"; - case GL_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; - default: return "unknown"; - } -} - -void gl_check_error(GL *gl, struct mp_log *log, const char *info) -{ - for (;;) { - GLenum error = gl->GetError(); - if (error == GL_NO_ERROR) - break; - mp_msg(log, MSGL_ERR, "%s: OpenGL error %s.\n", info, - gl_error_to_string(error)); - } -} - -static int get_alignment(int stride) -{ - if (stride % 8 == 0) - return 8; - if (stride % 4 == 0) - return 4; - if (stride % 2 == 0) - return 2; - return 1; -} - -// upload a texture, handling things like stride and 
slices -// target: texture target, usually GL_TEXTURE_2D -// format, type: texture parameters -// dataptr, stride: image data -// x, y, width, height: part of the image to upload -void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, - const void *dataptr, int stride, - int x, int y, int w, int h) -{ - int bpp = gl_bytes_per_pixel(format, type); - const uint8_t *data = dataptr; - int y_max = y + h; - if (w <= 0 || h <= 0 || !bpp) - return; - if (stride < 0) { - data += (h - 1) * stride; - stride = -stride; - } - gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(stride)); - int slice = h; - if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) { - // this is not always correct, but should work for MPlayer - gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride / bpp); - } else { - if (stride != bpp * w) - slice = 1; // very inefficient, but at least it works - } - for (; y + slice <= y_max; y += slice) { - gl->TexSubImage2D(target, 0, x, y, w, slice, format, type, data); - data += stride * slice; - } - if (y < y_max) - gl->TexSubImage2D(target, 0, x, y, w, y_max - y, format, type, data); - if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) - gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0); - gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); -} - -mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h) -{ - if (gl->es) - return NULL; // ES can't read from front buffer - mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, w, h); - if (!image) - return NULL; - gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); - GLenum obj = fbo ? 
GL_COLOR_ATTACHMENT0 : GL_FRONT; - gl->PixelStorei(GL_PACK_ALIGNMENT, 1); - gl->ReadBuffer(obj); - //flip image while reading (and also avoid stride-related trouble) - for (int y = 0; y < h; y++) { - gl->ReadPixels(0, h - y - 1, w, 1, GL_RGB, GL_UNSIGNED_BYTE, - image->planes[0] + y * image->stride[0]); - } - gl->PixelStorei(GL_PACK_ALIGNMENT, 4); - gl->BindFramebuffer(GL_FRAMEBUFFER, 0); - return image; -} - -static void gl_vao_enable_attribs(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - for (int n = 0; n < vao->num_entries; n++) { - const struct ra_renderpass_input *e = &vao->entries[n]; - GLenum type = 0; - bool normalized = false; - switch (e->type) { - case RA_VARTYPE_INT: - type = GL_INT; - break; - case RA_VARTYPE_FLOAT: - type = GL_FLOAT; - break; - case RA_VARTYPE_BYTE_UNORM: - type = GL_UNSIGNED_BYTE; - normalized = true; - break; - default: - abort(); - } - assert(e->dim_m == 1); - - gl->EnableVertexAttribArray(n); - gl->VertexAttribPointer(n, e->dim_v, type, normalized, - vao->stride, (void *)(intptr_t)e->offset); - } -} - -void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, - const struct ra_renderpass_input *entries, - int num_entries) -{ - assert(!vao->vao); - assert(!vao->buffer); - - *vao = (struct gl_vao){ - .gl = gl, - .stride = stride, - .entries = entries, - .num_entries = num_entries, - }; - - gl->GenBuffers(1, &vao->buffer); - - if (gl->BindVertexArray) { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - - gl->GenVertexArrays(1, &vao->vao); - gl->BindVertexArray(vao->vao); - gl_vao_enable_attribs(vao); - gl->BindVertexArray(0); - - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } -} - -void gl_vao_uninit(struct gl_vao *vao) -{ - GL *gl = vao->gl; - if (!gl) - return; - - if (gl->DeleteVertexArrays) - gl->DeleteVertexArrays(1, &vao->vao); - gl->DeleteBuffers(1, &vao->buffer); - - *vao = (struct gl_vao){0}; -} - -static void gl_vao_bind(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - if (gl->BindVertexArray) { - gl->BindVertexArray(vao->vao); - } 
else { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - gl_vao_enable_attribs(vao); - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } -} - -static void gl_vao_unbind(struct gl_vao *vao) -{ - GL *gl = vao->gl; - - if (gl->BindVertexArray) { - gl->BindVertexArray(0); - } else { - for (int n = 0; n < vao->num_entries; n++) - gl->DisableVertexAttribArray(n); - } -} - -// Draw the vertex data (as described by the gl_vao_entry entries) in ptr -// to the screen. num is the number of vertexes. prim is usually GL_TRIANGLES. -// If ptr is NULL, then skip the upload, and use the data uploaded with the -// previous call. -void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num) -{ - GL *gl = vao->gl; - - if (ptr) { - gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - gl->BufferData(GL_ARRAY_BUFFER, num * vao->stride, ptr, GL_STREAM_DRAW); - gl->BindBuffer(GL_ARRAY_BUFFER, 0); - } - - gl_vao_bind(vao); - - gl->DrawArrays(prim, 0, num); - - gl_vao_unbind(vao); -} - -static void GLAPIENTRY gl_debug_cb(GLenum source, GLenum type, GLuint id, - GLenum severity, GLsizei length, - const GLchar *message, const void *userParam) -{ - // keep in mind that the debug callback can be asynchronous - struct mp_log *log = (void *)userParam; - int level = MSGL_ERR; - switch (severity) { - case GL_DEBUG_SEVERITY_NOTIFICATION:level = MSGL_V; break; - case GL_DEBUG_SEVERITY_LOW: level = MSGL_INFO; break; - case GL_DEBUG_SEVERITY_MEDIUM: level = MSGL_WARN; break; - case GL_DEBUG_SEVERITY_HIGH: level = MSGL_ERR; break; - } - mp_msg(log, level, "GL: %s\n", message); -} - -void gl_set_debug_logger(GL *gl, struct mp_log *log) -{ - if (gl->DebugMessageCallback) - gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log); -} - -int gl_get_fb_depth(GL *gl, int fbo) -{ - if ((gl->es < 300 && !gl->version) || !(gl->mpgl_caps & MPGL_CAP_FB)) - return -1; - - gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); - - GLenum obj = gl->version ? 
GL_BACK_LEFT : GL_BACK; - if (fbo) - obj = GL_COLOR_ATTACHMENT0; - - GLint depth_g = -1; - - gl->GetFramebufferAttachmentParameteriv(GL_FRAMEBUFFER, obj, - GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE, &depth_g); - - gl->BindFramebuffer(GL_FRAMEBUFFER, 0); - - return depth_g > 0 ? depth_g : -1; -} diff --git a/video/out/opengl/gl_utils.h b/video/out/opengl/gl_utils.h deleted file mode 100644 index 306ee23..0000000 --- a/video/out/opengl/gl_utils.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * This file is part of mpv. - * Parts based on MPlayer code by Reimar Döffinger. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#ifndef MP_GL_UTILS_ -#define MP_GL_UTILS_ - -#include <math.h> - -#include "common.h" -#include "ra.h" - -struct mp_log; - -void gl_check_error(GL *gl, struct mp_log *log, const char *info); - -void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, - const void *dataptr, int stride, - int x, int y, int w, int h); - -mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h); - -struct gl_vao { - GL *gl; - GLuint vao; // the VAO object, or 0 if unsupported by driver - GLuint buffer; // GL_ARRAY_BUFFER used for the data - int stride; // size of each element (interleaved elements are assumed) - const struct ra_renderpass_input *entries; - int num_entries; -}; - -void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, - const struct ra_renderpass_input *entries, - int num_entries); -void gl_vao_uninit(struct gl_vao *vao); -void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num); - -void gl_set_debug_logger(GL *gl, struct mp_log *log); - -int gl_get_fb_depth(GL *gl, int fbo); - -#endif diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c index d40bafe..1a7df20 100644 --- a/video/out/opengl/hwdec_cuda.c +++ b/video/out/opengl/hwdec_cuda.c @@ -32,11 +32,10 @@ #include <libavutil/hwcontext.h> #include <libavutil/hwcontext_cuda.h> +#include "video/out/gpu/hwdec.h" #include "formats.h" -#include "hwdec.h" #include "options/m_config.h" #include "ra_gl.h" -#include "video.h" struct priv_owner { struct mp_hwdec_ctx hwctx; @@ -161,11 +160,9 @@ static int cuda_init(struct ra_hwdec *hw) goto error; p->hwctx = (struct mp_hwdec_ctx) { - .type = HWDEC_CUDA, - .ctx = p->decode_ctx, + .driver_name = hw->driver->name, .av_device_ref = hw_device_ctx, }; - p->hwctx.driver_name = hw->driver->name; hwdec_devices_add(hw->devs, &p->hwctx); return 0; @@ -180,8 +177,7 @@ static void cuda_uninit(struct ra_hwdec *hw) { struct priv_owner *p = hw->priv; - if (p->hwctx.ctx) - hwdec_devices_remove(hw->devs, &p->hwctx); + 
hwdec_devices_remove(hw->devs, &p->hwctx); av_buffer_unref(&p->hwctx.av_device_ref); if (p->decode_ctx && p->decode_ctx != p->display_ctx) @@ -327,8 +323,7 @@ static int mapper_map(struct ra_hwdec_mapper *mapper) } const struct ra_hwdec_driver ra_hwdec_cuda = { - .name = "cuda", - .api = HWDEC_CUDA, + .name = "cuda-nvdec", .imgfmts = {IMGFMT_CUDA, 0}, .priv_size = sizeof(struct priv_owner), .init = cuda_init, diff --git a/video/out/opengl/hwdec_d3d11egl.c b/video/out/opengl/hwdec_d3d11egl.c index 3988f83..e741633 100644 --- a/video/out/opengl/hwdec_d3d11egl.c +++ b/video/out/opengl/hwdec_d3d11egl.c @@ -27,10 +27,10 @@ #include "common/common.h" #include "osdep/timer.h" #include "osdep/windows_utils.h" -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "ra_gl.h" #include "video/hwdec.h" -#include "video/decode/d3d.h" +#include "video/d3d.h" #ifndef EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE #define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x33AB @@ -75,8 +75,7 @@ static void uninit(struct ra_hwdec *hw) { struct priv_owner *p = hw->priv; - if (p->hwctx.ctx) - hwdec_devices_remove(hw->devs, &p->hwctx); + hwdec_devices_remove(hw->devs, &p->hwctx); if (p->d3d11_device) ID3D11Device_Release(p->d3d11_device); @@ -180,10 +179,7 @@ static int init(struct ra_hwdec *hw) ID3D10Multithread_Release(multithread); p->hwctx = (struct mp_hwdec_ctx){ - .type = HWDEC_D3D11VA, .driver_name = hw->driver->name, - .ctx = p->d3d11_device, - .download_image = d3d11_download_image, .av_device_ref = d3d11_wrap_device_ref(p->d3d11_device), }; hwdec_devices_add(hw->devs, &p->hwctx); @@ -336,7 +332,6 @@ static void mapper_unmap(struct ra_hwdec_mapper *mapper) const struct ra_hwdec_driver ra_hwdec_d3d11egl = { .name = "d3d11-egl", .priv_size = sizeof(struct priv_owner), - .api = HWDEC_D3D11VA, .imgfmts = {IMGFMT_D3D11NV12, 0}, .init = init, .uninit = uninit, diff --git a/video/out/opengl/hwdec_d3d11eglrgb.c b/video/out/opengl/hwdec_d3d11eglrgb.c index fa3976f..c8f6580 100644 --- 
a/video/out/opengl/hwdec_d3d11eglrgb.c +++ b/video/out/opengl/hwdec_d3d11eglrgb.c @@ -27,10 +27,10 @@ #include "common/common.h" #include "osdep/timer.h" #include "osdep/windows_utils.h" -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "ra_gl.h" #include "video/hwdec.h" -#include "video/decode/d3d.h" +#include "video/d3d.h" #ifndef EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE #define EGL_D3D_TEXTURE_SUBRESOURCE_ID_ANGLE 0x3AAB @@ -54,8 +54,7 @@ static void uninit(struct ra_hwdec *hw) { struct priv_owner *p = hw->priv; - if (p->hwctx.ctx) - hwdec_devices_remove(hw->devs, &p->hwctx); + hwdec_devices_remove(hw->devs, &p->hwctx); if (p->d3d11_device) ID3D11Device_Release(p->d3d11_device); @@ -137,9 +136,7 @@ static int init(struct ra_hwdec *hw) } p->hwctx = (struct mp_hwdec_ctx){ - .type = HWDEC_D3D11VA, .driver_name = hw->driver->name, - .ctx = p->d3d11_device, .av_device_ref = d3d11_wrap_device_ref(p->d3d11_device), }; hwdec_devices_add(hw->devs, &p->hwctx); @@ -261,7 +258,6 @@ static int mapper_map(struct ra_hwdec_mapper *mapper) const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb = { .name = "d3d11-egl-rgb", .priv_size = sizeof(struct priv_owner), - .api = HWDEC_D3D11VA, .imgfmts = {IMGFMT_D3D11RGB, 0}, .init = init, .uninit = uninit, diff --git a/video/out/opengl/hwdec_drmprime_drm.c b/video/out/opengl/hwdec_drmprime_drm.c new file mode 100644 index 0000000..faa099a --- /dev/null +++ b/video/out/opengl/hwdec_drmprime_drm.c @@ -0,0 +1,268 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <assert.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <stdbool.h> + +#include <libavutil/hwcontext_drm.h> + +#include "common.h" +#include "video/hwdec.h" +#include "common/msg.h" +#include "options/m_config.h" +#include "libmpv/opengl_cb.h" +#include "video/out/drm_common.h" +#include "video/out/drm_prime.h" +#include "video/out/gpu/hwdec.h" +#include "video/mp_image.h" + +#include "ra_gl.h" + +extern const struct m_sub_options drm_conf; + +struct drm_frame { + struct drm_prime_framebuffer fb; + struct mp_image *image; // associated mpv image +}; + +struct priv { + struct mp_log *log; + + struct mp_image_params params; + + struct drm_atomic_context *ctx; + struct drm_frame current_frame, old_frame; + + struct mp_rect src, dst; + + int display_w, display_h; +}; + +static void set_current_frame(struct ra_hwdec *hw, struct drm_frame *frame) +{ + struct priv *p = hw->priv; + + // frame will be on screen after next vsync + // current_frame is currently the displayed frame and will be replaced + // by frame after next vsync. + // We used old frame as triple buffering to make sure that the drm framebuffer + // is not being displayed when we release it. 
+ + if (p->ctx) { + drm_prime_destroy_framebuffer(p->log, p->ctx->fd, &p->old_frame.fb); + } + + mp_image_setrefp(&p->old_frame.image, p->current_frame.image); + p->old_frame.fb = p->current_frame.fb; + + if (frame) { + p->current_frame.fb = frame->fb; + mp_image_setrefp(&p->current_frame.image, frame->image); + } else { + memset(&p->current_frame.fb, 0, sizeof(p->current_frame.fb)); + mp_image_setrefp(&p->current_frame.image, NULL); + } +} + +static void scale_dst_rect(struct ra_hwdec *hw, int source_w, int source_h ,struct mp_rect *src, struct mp_rect *dst) +{ + struct priv *p = hw->priv; + double hratio, vratio, ratio; + + // drm can allow to have a layer that has a different size from framebuffer + // we scale here the destination size to video mode + hratio = vratio = ratio = 1.0; + + hratio = (double)p->display_w / (double)source_w; + vratio = (double)p->display_h / (double)source_h; + ratio = hratio <= vratio ? hratio : vratio; + + dst->x0 = src->x0 * ratio; + dst->x1 = src->x1 * ratio; + dst->y0 = src->y0 * ratio; + dst->y1 = src->y1 * ratio; + + int offset_x = (p->display_w - ratio * source_w) / 2; + int offset_y = (p->display_h - ratio * source_h) / 2; + + dst->x0 += offset_x; + dst->x1 += offset_x; + dst->y0 += offset_y; + dst->y1 += offset_y; +} + +static int overlay_frame(struct ra_hwdec *hw, struct mp_image *hw_image, + struct mp_rect *src, struct mp_rect *dst, bool newframe) +{ + struct priv *p = hw->priv; + GL *gl = ra_gl_get(hw->ra); + AVDRMFrameDescriptor *desc = NULL; + drmModeAtomicReq *request = NULL; + struct drm_frame next_frame = {0}; + int ret; + + if (hw_image) { + + // grab opengl-cb windowing info to eventually upscale the overlay + // as egl windows could be upscaled to primary plane. + struct mpv_opengl_cb_window_pos *glparams = + gl ? 
(struct mpv_opengl_cb_window_pos *) + mpgl_get_native_display(gl, "opengl-cb-window-pos") : NULL; + if (glparams) { + scale_dst_rect(hw, glparams->width, glparams->height, dst, &p->dst); + } else { + p->dst = *dst; + } + p->src = *src; + + // grab drm interop info + struct mpv_opengl_cb_drm_params *drmparams = + gl ? (struct mpv_opengl_cb_drm_params *) + mpgl_get_native_display(gl, "opengl-cb-drm-params") : NULL; + if (drmparams) + request = (drmModeAtomicReq *)drmparams->atomic_request; + + next_frame.image = hw_image; + desc = (AVDRMFrameDescriptor *)hw_image->planes[0]; + + if (desc) { + int srcw = p->src.x1 - p->src.x0; + int srch = p->src.y1 - p->src.y0; + int dstw = MP_ALIGN_UP(p->dst.x1 - p->dst.x0, 2); + int dsth = MP_ALIGN_UP(p->dst.y1 - p->dst.y0, 2); + + if (drm_prime_create_framebuffer(p->log, p->ctx->fd, desc, srcw, srch, &next_frame.fb)) { + ret = -1; + goto fail; + } + + if (request) { + drm_object_set_property(request, p->ctx->overlay_plane, "FB_ID", next_frame.fb.fb_id); + drm_object_set_property(request, p->ctx->overlay_plane, "CRTC_ID", p->ctx->crtc->id); + drm_object_set_property(request, p->ctx->overlay_plane, "SRC_X", p->src.x0 << 16); + drm_object_set_property(request, p->ctx->overlay_plane, "SRC_Y", p->src.y0 << 16); + drm_object_set_property(request, p->ctx->overlay_plane, "SRC_W", srcw << 16); + drm_object_set_property(request, p->ctx->overlay_plane, "SRC_H", srch << 16); + drm_object_set_property(request, p->ctx->overlay_plane, "CRTC_X", MP_ALIGN_DOWN(p->dst.x0, 2)); + drm_object_set_property(request, p->ctx->overlay_plane, "CRTC_Y", MP_ALIGN_DOWN(p->dst.y0, 2)); + drm_object_set_property(request, p->ctx->overlay_plane, "CRTC_W", dstw); + drm_object_set_property(request, p->ctx->overlay_plane, "CRTC_H", dsth); + drm_object_set_property(request, p->ctx->overlay_plane, "ZPOS", 0); + } else { + ret = drmModeSetPlane(p->ctx->fd, p->ctx->overlay_plane->id, p->ctx->crtc->id, next_frame.fb.fb_id, 0, + MP_ALIGN_DOWN(p->dst.x0, 2), 
MP_ALIGN_DOWN(p->dst.y0, 2), dstw, dsth, + p->src.x0 << 16, p->src.y0 << 16 , srcw << 16, srch << 16); + if (ret < 0) { + MP_ERR(hw, "Failed to set the plane %d (buffer %d).\n", p->ctx->overlay_plane->id, + next_frame.fb.fb_id); + goto fail; + } + } + } + } + + set_current_frame(hw, &next_frame); + return 0; + + fail: + drm_prime_destroy_framebuffer(p->log, p->ctx->fd, &next_frame.fb); + return ret; +} + +static void uninit(struct ra_hwdec *hw) +{ + struct priv *p = hw->priv; + + set_current_frame(hw, NULL); + + if (p->ctx) { + drm_atomic_destroy_context(p->ctx); + p->ctx = NULL; + } +} + +static int init(struct ra_hwdec *hw) +{ + struct priv *p = hw->priv; + int drm_overlay; + + if (!ra_is_gl(hw->ra)) + return -1; + + p->log = hw->log; + + void *tmp = talloc_new(NULL); + struct drm_opts *opts = mp_get_config_group(tmp, hw->global, &drm_conf); + drm_overlay = opts->drm_overlay_id; + talloc_free(tmp); + + GL *gl = ra_gl_get(hw->ra); + struct mpv_opengl_cb_drm_params *params = + gl ? (struct mpv_opengl_cb_drm_params *) + mpgl_get_native_display(gl, "opengl-cb-drm-params") : NULL; + if (!params) { + MP_VERBOSE(hw, "Could not get drm interop info.\n"); + goto err; + } + + if (params->fd) { + p->ctx = drm_atomic_create_context(p->log, params->fd, params->crtc_id, + drm_overlay); + if (!p->ctx) { + mp_err(p->log, "Failed to retrieve DRM atomic context.\n"); + goto err; + } + } else { + mp_err(p->log, "Failed to retrieve DRM fd from native display.\n"); + goto err; + } + + drmModeCrtcPtr crtc; + crtc = drmModeGetCrtc(p->ctx->fd, p->ctx->crtc->id); + if (crtc) { + p->display_w = crtc->mode.hdisplay; + p->display_h = crtc->mode.vdisplay; + drmModeFreeCrtc(crtc); + } + + + uint64_t has_prime; + if (drmGetCap(p->ctx->fd, DRM_CAP_PRIME, &has_prime) < 0) { + MP_ERR(hw, "Card does not support prime handles.\n"); + goto err; + } + + return 0; + +err: + uninit(hw); + return -1; +} + +const struct ra_hwdec_driver ra_hwdec_drmprime_drm = { + .name = "drmprime-drm", + .priv_size = 
sizeof(struct priv), + .imgfmts = {IMGFMT_DRMPRIME, 0}, + .init = init, + .overlay_frame = overlay_frame, + .uninit = uninit, +}; diff --git a/video/out/opengl/hwdec_dxva2egl.c b/video/out/opengl/hwdec_dxva2egl.c index 01fb482..0f8a4ad 100644 --- a/video/out/opengl/hwdec_dxva2egl.c +++ b/video/out/opengl/hwdec_dxva2egl.c @@ -27,10 +27,10 @@ #include "common/common.h" #include "osdep/timer.h" #include "osdep/windows_utils.h" -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "ra_gl.h" #include "video/hwdec.h" -#include "video/decode/d3d.h" +#include "video/d3d.h" struct priv_owner { struct mp_hwdec_ctx hwctx; @@ -58,8 +58,8 @@ static void uninit(struct ra_hwdec *hw) { struct priv_owner *p = hw->priv; - if (p->hwctx.ctx) - hwdec_devices_remove(hw->devs, &p->hwctx); + hwdec_devices_remove(hw->devs, &p->hwctx); + av_buffer_unref(&p->hwctx.av_device_ref); if (p->device9ex) IDirect3DDevice9Ex_Release(p->device9ex); @@ -180,9 +180,7 @@ static int init(struct ra_hwdec *hw) ra_hwdec_mapper_free(&mapper); p->hwctx = (struct mp_hwdec_ctx){ - .type = HWDEC_DXVA2, .driver_name = hw->driver->name, - .ctx = (IDirect3DDevice9 *)p->device9ex, .av_device_ref = d3d9_wrap_device_ref((IDirect3DDevice9 *)p->device9ex), }; hwdec_devices_add(hw->devs, &p->hwctx); @@ -368,7 +366,6 @@ static int mapper_map(struct ra_hwdec_mapper *mapper) const struct ra_hwdec_driver ra_hwdec_dxva2egl = { .name = "dxva2-egl", .priv_size = sizeof(struct priv_owner), - .api = HWDEC_DXVA2, .imgfmts = {IMGFMT_DXVA2, 0}, .init = init, .uninit = uninit, diff --git a/video/out/opengl/hwdec_dxva2gldx.c b/video/out/opengl/hwdec_dxva2gldx.c index fd9c80b..984fd7f 100644 --- a/video/out/opengl/hwdec_dxva2gldx.c +++ b/video/out/opengl/hwdec_dxva2gldx.c @@ -20,10 +20,10 @@ #include "common/common.h" #include "osdep/windows_utils.h" -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "ra_gl.h" #include "video/hwdec.h" -#include "video/decode/d3d.h" +#include "video/d3d.h" // for 
WGL_ACCESS_READ_ONLY_NV #include <GL/wglext.h> @@ -48,8 +48,8 @@ static void uninit(struct ra_hwdec *hw) { struct priv_owner *p = hw->priv; - if (p->hwctx.ctx) - hwdec_devices_remove(hw->devs, &p->hwctx); + hwdec_devices_remove(hw->devs, &p->hwctx); + av_buffer_unref(&p->hwctx.av_device_ref); if (p->device) IDirect3DDevice9Ex_Release(p->device); @@ -78,9 +78,7 @@ static int init(struct ra_hwdec *hw) IDirect3DDevice9Ex_AddRef(p->device); p->hwctx = (struct mp_hwdec_ctx){ - .type = HWDEC_DXVA2, .driver_name = hw->driver->name, - .ctx = (IDirect3DDevice9 *)p->device, .av_device_ref = d3d9_wrap_device_ref((IDirect3DDevice9 *)p->device), }; hwdec_devices_add(hw->devs, &p->hwctx); @@ -229,7 +227,6 @@ static int mapper_map(struct ra_hwdec_mapper *mapper) const struct ra_hwdec_driver ra_hwdec_dxva2gldx = { .name = "dxva2-dxinterop", .priv_size = sizeof(struct priv_owner), - .api = HWDEC_DXVA2, .imgfmts = {IMGFMT_DXVA2, 0}, .init = init, .uninit = uninit, diff --git a/video/out/opengl/hwdec_ios.m b/video/out/opengl/hwdec_ios.m index 8e020de..b8d4876 100644 --- a/video/out/opengl/hwdec_ios.m +++ b/video/out/opengl/hwdec_ios.m @@ -27,10 +27,9 @@ #include "config.h" +#include "video/out/gpu/hwdec.h" #include "video/mp_image_pool.h" -#include "video/vt.h" #include "ra_gl.h" -#include "hwdec.h" struct priv_owner { struct mp_hwdec_ctx hwctx; @@ -70,15 +69,11 @@ static int init(struct ra_hwdec *hw) return -1; p->hwctx = (struct mp_hwdec_ctx){ - .type = HWDEC_VIDEOTOOLBOX, - .download_image = mp_vt_download_image, - .ctx = &p->hwctx, + .driver_name = hw->driver->name, }; -#if HAVE_VIDEOTOOLBOX_HWACCEL_NEW av_hwdevice_ctx_create(&p->hwctx.av_device_ref, AV_HWDEVICE_TYPE_VIDEOTOOLBOX, NULL, NULL, 0); -#endif hwdec_devices_add(hw->devs, &p->hwctx); @@ -89,8 +84,7 @@ static void uninit(struct ra_hwdec *hw) { struct priv_owner *p = hw->priv; - if (p->hwctx.ctx) - hwdec_devices_remove(hw->devs, &p->hwctx); + hwdec_devices_remove(hw->devs, &p->hwctx); 
av_buffer_unref(&p->hwctx.av_device_ref); } @@ -132,7 +126,6 @@ static const struct ra_format *find_la_variant(struct ra *ra, static int mapper_init(struct ra_hwdec_mapper *mapper) { struct priv *p = mapper->priv; - GL *gl = ra_gl_get(mapper->ra); mapper->dst_params = mapper->src_params; mapper->dst_params.imgfmt = mapper->src_params.hw_subfmt; @@ -243,8 +236,11 @@ static int mapper_map(struct ra_hwdec_mapper *mapper) .src_linear = true, }; - mapper->tex[i] = ra_create_wrapped_tex(mapper->ra, ¶ms, - p->gl_planes[i]); + mapper->tex[i] = ra_create_wrapped_tex( + mapper->ra, + ¶ms, + CVOpenGLESTextureGetName(p->gl_planes[i]) + ); if (!mapper->tex[i]) return -1; } @@ -264,7 +260,6 @@ static void mapper_uninit(struct ra_hwdec_mapper *mapper) const struct ra_hwdec_driver ra_hwdec_videotoolbox = { .name = "videotoolbox", .priv_size = sizeof(struct priv_owner), - .api = HWDEC_VIDEOTOOLBOX, .imgfmts = {IMGFMT_VIDEOTOOLBOX, 0}, .init = init, .uninit = uninit, diff --git a/video/out/opengl/hwdec_osx.c b/video/out/opengl/hwdec_osx.c index 348a5e1..ca7a004 100644 --- a/video/out/opengl/hwdec_osx.c +++ b/video/out/opengl/hwdec_osx.c @@ -29,9 +29,8 @@ #include "config.h" #include "video/mp_image_pool.h" -#include "video/vt.h" +#include "video/out/gpu/hwdec.h" #include "ra_gl.h" -#include "hwdec.h" struct priv_owner { struct mp_hwdec_ctx hwctx; @@ -71,15 +70,11 @@ static int init(struct ra_hwdec *hw) return -1; p->hwctx = (struct mp_hwdec_ctx){ - .type = HWDEC_VIDEOTOOLBOX, - .download_image = mp_vt_download_image, - .ctx = &p->hwctx, + .driver_name = hw->driver->name, }; -#if HAVE_VIDEOTOOLBOX_HWACCEL_NEW av_hwdevice_ctx_create(&p->hwctx.av_device_ref, AV_HWDEVICE_TYPE_VIDEOTOOLBOX, NULL, NULL, 0); -#endif hwdec_devices_add(hw->devs, &p->hwctx); @@ -90,8 +85,7 @@ static void uninit(struct ra_hwdec *hw) { struct priv_owner *p = hw->priv; - if (p->hwctx.ctx) - hwdec_devices_remove(hw->devs, &p->hwctx); + hwdec_devices_remove(hw->devs, &p->hwctx); 
av_buffer_unref(&p->hwctx.av_device_ref); } @@ -214,7 +208,6 @@ static void mapper_uninit(struct ra_hwdec_mapper *mapper) const struct ra_hwdec_driver ra_hwdec_videotoolbox = { .name = "videotoolbox", .priv_size = sizeof(struct priv_owner), - .api = HWDEC_VIDEOTOOLBOX, .imgfmts = {IMGFMT_VIDEOTOOLBOX, 0}, .init = init, .uninit = uninit, diff --git a/video/out/opengl/hwdec_rpi.c b/video/out/opengl/hwdec_rpi.c index 6f39c3e..6c080f1 100644 --- a/video/out/opengl/hwdec_rpi.c +++ b/video/out/opengl/hwdec_rpi.c @@ -33,8 +33,8 @@ #include "common/common.h" #include "common/msg.h" #include "video/mp_image.h" +#include "video/out/gpu/hwdec.h" -#include "hwdec.h" #include "common.h" #include "ra_gl.h" @@ -378,7 +378,6 @@ static int create(struct ra_hwdec *hw) const struct ra_hwdec_driver ra_hwdec_rpi_overlay = { .name = "rpi-overlay", - .api = HWDEC_RPI, .priv_size = sizeof(struct priv), .imgfmts = {IMGFMT_MMAL, IMGFMT_420P, 0}, .init = create, diff --git a/video/out/opengl/hwdec_vaegl.c b/video/out/opengl/hwdec_vaegl.c index a0e3222..b4587c5 100644 --- a/video/out/opengl/hwdec_vaegl.c +++ b/video/out/opengl/hwdec_vaegl.c @@ -18,6 +18,7 @@ #include <stddef.h> #include <string.h> #include <assert.h> +#include <unistd.h> #include <EGL/egl.h> #include <EGL/eglext.h> @@ -30,9 +31,9 @@ #include "config.h" -#include "hwdec.h" -#include "video/vaapi.h" +#include "video/out/gpu/hwdec.h" #include "video/mp_image_pool.h" +#include "video/vaapi.h" #include "common.h" #include "ra_gl.h" @@ -127,6 +128,11 @@ struct priv { EGLImageKHR images[4]; VAImage current_image; bool buffer_acquired; +#if VA_CHECK_VERSION(1, 1, 0) + bool esh_not_implemented; + VADRMPRIMESurfaceDescriptor desc; + bool surface_acquired; +#endif EGLImageKHR (EGLAPIENTRY *CreateImageKHR)(EGLDisplay, EGLContext, EGLenum, EGLClientBuffer, @@ -209,6 +215,14 @@ static void mapper_unmap(struct ra_hwdec_mapper *mapper) p->images[n] = 0; } +#if VA_CHECK_VERSION(1, 1, 0) + if (p->surface_acquired) { + for (int n = 0; n < 
p->desc.num_objects; n++) + close(p->desc.objects[n].fd); + p->surface_acquired = false; + } +#endif + if (p->buffer_acquired) { status = vaReleaseBufferHandle(display, p->current_image.buf); CHECK_VA_STATUS(mapper, "vaReleaseBufferHandle()"); @@ -330,6 +344,72 @@ static int mapper_map(struct ra_hwdec_mapper *mapper) VAImage *va_image = &p->current_image; VADisplay *display = p_owner->display; +#if VA_CHECK_VERSION(1, 1, 0) + if (p->esh_not_implemented) + goto esh_failed; + + status = vaExportSurfaceHandle(display, va_surface_id(mapper->src), + VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2, + VA_EXPORT_SURFACE_READ_ONLY | + VA_EXPORT_SURFACE_SEPARATE_LAYERS, + &p->desc); + if (!CHECK_VA_STATUS(mapper, "vaAcquireSurfaceHandle()")) { + if (status == VA_STATUS_ERROR_UNIMPLEMENTED) + p->esh_not_implemented = true; + goto esh_failed; + } + p->surface_acquired = true; + + for (int n = 0; n < p->num_planes; n++) { + int attribs[20] = {EGL_NONE}; + int num_attribs = 0; + + ADD_ATTRIB(EGL_LINUX_DRM_FOURCC_EXT, p->desc.layers[n].drm_format); + ADD_ATTRIB(EGL_WIDTH, p->tex[n]->params.w); + ADD_ATTRIB(EGL_HEIGHT, p->tex[n]->params.h); + +#define ADD_PLANE_ATTRIBS(plane) do { \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _FD_EXT, \ + p->desc.objects[p->desc.layers[n].object_index[plane]].fd); \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _OFFSET_EXT, \ + p->desc.layers[n].offset[plane]); \ + ADD_ATTRIB(EGL_DMA_BUF_PLANE ## plane ## _PITCH_EXT, \ + p->desc.layers[n].pitch[plane]); \ + } while (0) + + ADD_PLANE_ATTRIBS(0); + if (p->desc.layers[n].num_planes > 1) + ADD_PLANE_ATTRIBS(1); + if (p->desc.layers[n].num_planes > 2) + ADD_PLANE_ATTRIBS(2); + if (p->desc.layers[n].num_planes > 3) + ADD_PLANE_ATTRIBS(3); + + p->images[n] = p->CreateImageKHR(eglGetCurrentDisplay(), + EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, attribs); + if (!p->images[n]) + goto esh_failed; + + gl->BindTexture(GL_TEXTURE_2D, p->gl_textures[n]); + p->EGLImageTargetTexture2DOES(GL_TEXTURE_2D, p->images[n]); + + 
mapper->tex[n] = p->tex[n]; + } + gl->BindTexture(GL_TEXTURE_2D, 0); + + if (p->desc.fourcc == VA_FOURCC_YV12) + MPSWAP(struct ra_tex*, mapper->tex[1], mapper->tex[2]); + + return 0; + +esh_failed: + if (p->surface_acquired) { + for (int n = 0; n < p->desc.num_objects; n++) + close(p->desc.objects[n].fd); + p->surface_acquired = false; + } +#endif + status = vaDeriveImage(display, va_surface_id(mapper->src), va_image); if (!CHECK_VA_STATUS(mapper, "vaDeriveImage()")) goto err; @@ -417,7 +497,7 @@ static void determine_working_formats(struct ra_hwdec *hw) AVHWFramesConstraints *fc = av_hwdevice_get_hwframe_constraints(p->ctx->av_device_ref, NULL); if (!fc) { - MP_WARN(hw, "failed to retrieve libavutil frame constaints\n"); + MP_WARN(hw, "failed to retrieve libavutil frame constraints\n"); goto done; } for (int n = 0; fc->valid_sw_formats[n] != AV_PIX_FMT_NONE; n++) { @@ -464,7 +544,6 @@ done: const struct ra_hwdec_driver ra_hwdec_vaegl = { .name = "vaapi-egl", .priv_size = sizeof(struct priv_owner), - .api = HWDEC_VAAPI, .imgfmts = {IMGFMT_VAAPI, 0}, .init = init, .uninit = uninit, diff --git a/video/out/opengl/hwdec_vaglx.c b/video/out/opengl/hwdec_vaglx.c deleted file mode 100644 index 8db15c4..0000000 --- a/video/out/opengl/hwdec_vaglx.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * This file is part of mpv. - * - * Parts based on the MPlayer VA-API patch (see vo_vaapi.c). - * - * mpv is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with mpv. 
If not, see <http://www.gnu.org/licenses/>. - */ - -#include <stddef.h> -#include <string.h> -#include <assert.h> - -#include <GL/glx.h> -#include <va/va_x11.h> - -#include "video/out/x11_common.h" -#include "ra_gl.h" -#include "hwdec.h" -#include "video/vaapi.h" - -struct priv_owner { - struct mp_vaapi_ctx *ctx; - VADisplay *display; - Display *xdisplay; - GLXFBConfig fbc; -}; - -struct priv { - GLuint gl_texture; - Pixmap pixmap; - GLXPixmap glxpixmap; - void (*glXBindTexImage)(Display *dpy, GLXDrawable draw, int buffer, int *a); - void (*glXReleaseTexImage)(Display *dpy, GLXDrawable draw, int buffer); -}; - -static void uninit(struct ra_hwdec *hw) -{ - struct priv_owner *p = hw->priv; - if (p->ctx) - hwdec_devices_remove(hw->devs, &p->ctx->hwctx); - va_destroy(p->ctx); -} - -static int init(struct ra_hwdec *hw) -{ - Display *x11disp = glXGetCurrentDisplay(); - if (!x11disp || !ra_is_gl(hw->ra)) - return -1; - int x11scr = DefaultScreen(x11disp); - struct priv_owner *p = hw->priv; - p->xdisplay = x11disp; - const char *glxext = glXQueryExtensionsString(x11disp, x11scr); - if (!glxext || !strstr(glxext, "GLX_EXT_texture_from_pixmap")) - return -1; - p->display = vaGetDisplay(x11disp); - if (!p->display) - return -1; - p->ctx = va_initialize(p->display, hw->log, true); - if (!p->ctx) { - vaTerminate(p->display); - return -1; - } - - int attribs[] = { - GLX_BIND_TO_TEXTURE_RGBA_EXT, True, - GLX_DRAWABLE_TYPE, GLX_PIXMAP_BIT, - GLX_BIND_TO_TEXTURE_TARGETS_EXT, GLX_TEXTURE_2D_BIT_EXT, - GLX_Y_INVERTED_EXT, True, - GLX_DOUBLEBUFFER, False, - GLX_RED_SIZE, 8, - GLX_GREEN_SIZE, 8, - GLX_BLUE_SIZE, 8, - GLX_ALPHA_SIZE, 0, - None - }; - - int fbcount; - GLXFBConfig *fbc = glXChooseFBConfig(x11disp, x11scr, attribs, &fbcount); - if (fbcount) - p->fbc = fbc[0]; - if (fbc) - XFree(fbc); - if (!fbcount) { - MP_VERBOSE(hw, "No texture-from-pixmap support.\n"); - return -1; - } - - p->ctx->hwctx.driver_name = hw->driver->name; - hwdec_devices_add(hw->devs, &p->ctx->hwctx); - 
return 0; -} - -static int mapper_init(struct ra_hwdec_mapper *mapper) -{ - struct priv_owner *p_owner = mapper->owner->priv; - struct priv *p = mapper->priv; - GL *gl = ra_gl_get(mapper->ra); - Display *xdisplay = p_owner->xdisplay; - - p->glXBindTexImage = - (void*)glXGetProcAddressARB((void*)"glXBindTexImageEXT"); - p->glXReleaseTexImage = - (void*)glXGetProcAddressARB((void*)"glXReleaseTexImageEXT"); - if (!p->glXBindTexImage || !p->glXReleaseTexImage) - return -1; - - gl->GenTextures(1, &p->gl_texture); - gl->BindTexture(GL_TEXTURE_2D, p->gl_texture); - gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - gl->BindTexture(GL_TEXTURE_2D, 0); - - p->pixmap = XCreatePixmap(xdisplay, - RootWindow(xdisplay, DefaultScreen(xdisplay)), - mapper->src_params.w, mapper->src_params.h, 24); - if (!p->pixmap) { - MP_FATAL(mapper, "could not create pixmap\n"); - return -1; - } - - int attribs[] = { - GLX_TEXTURE_TARGET_EXT, GLX_TEXTURE_2D_EXT, - GLX_TEXTURE_FORMAT_EXT, GLX_TEXTURE_FORMAT_RGB_EXT, - GLX_MIPMAP_TEXTURE_EXT, False, - None, - }; - p->glxpixmap = glXCreatePixmap(xdisplay, p_owner->fbc, p->pixmap, attribs); - - gl->BindTexture(GL_TEXTURE_2D, p->gl_texture); - p->glXBindTexImage(xdisplay, p->glxpixmap, GLX_FRONT_EXT, NULL); - gl->BindTexture(GL_TEXTURE_2D, 0); - - struct ra_tex_params params = { - .dimensions = 2, - .w = mapper->src_params.w, - .h = mapper->src_params.h, - .d = 1, - .format = ra_find_unorm_format(mapper->ra, 1, 4), // unsure - .render_src = true, - .src_linear = true, - }; - if (!params.format) - return -1; - - mapper->tex[0] = ra_create_wrapped_tex(mapper->ra, ¶ms, p->gl_texture); - if (!mapper->tex[0]) - return -1; - - mapper->dst_params = mapper->src_params; - mapper->dst_params.imgfmt = IMGFMT_RGB0; - 
mapper->dst_params.hw_subfmt = 0; - - return 0; -} - -static void mapper_uninit(struct ra_hwdec_mapper *mapper) -{ - struct priv_owner *p_owner = mapper->owner->priv; - struct priv *p = mapper->priv; - GL *gl = ra_gl_get(mapper->ra); - Display *xdisplay = p_owner->xdisplay; - - if (p->glxpixmap) { - p->glXReleaseTexImage(xdisplay, p->glxpixmap, GLX_FRONT_EXT); - glXDestroyPixmap(xdisplay, p->glxpixmap); - } - p->glxpixmap = 0; - - if (p->pixmap) - XFreePixmap(xdisplay, p->pixmap); - p->pixmap = 0; - - ra_tex_free(mapper->ra, &mapper->tex[0]); - gl->DeleteTextures(1, &p->gl_texture); - p->gl_texture = 0; -} - -static int mapper_map(struct ra_hwdec_mapper *mapper) -{ - struct priv_owner *p_owner = mapper->owner->priv; - struct priv *p = mapper->priv; - VAStatus status; - - struct mp_image *hw_image = mapper->src; - - if (!p->pixmap) - return -1; - - status = vaPutSurface(p_owner->display, va_surface_id(hw_image), p->pixmap, - 0, 0, hw_image->w, hw_image->h, - 0, 0, hw_image->w, hw_image->h, - NULL, 0, - va_get_colorspace_flag(hw_image->params.color.space)); - CHECK_VA_STATUS(mapper, "vaPutSurface()"); - - return 0; -} - -const struct ra_hwdec_driver ra_hwdec_vaglx = { - .name = "vaapi-glx", - .priv_size = sizeof(struct priv_owner), - .api = HWDEC_VAAPI, - .imgfmts = {IMGFMT_VAAPI, 0}, - .testing_only = true, - .init = init, - .uninit = uninit, - .mapper = &(const struct ra_hwdec_mapper_driver){ - .priv_size = sizeof(struct priv), - .init = mapper_init, - .uninit = mapper_uninit, - .map = mapper_map, - }, -}; diff --git a/video/out/opengl/hwdec_vdpau.c b/video/out/opengl/hwdec_vdpau.c index d733650..603a70e 100644 --- a/video/out/opengl/hwdec_vdpau.c +++ b/video/out/opengl/hwdec_vdpau.c @@ -20,7 +20,7 @@ #include <GL/glx.h> -#include "hwdec.h" +#include "video/out/gpu/hwdec.h" #include "ra_gl.h" #include "video/vdpau.h" #include "video/vdpau_mixer.h" @@ -304,7 +304,6 @@ static int mapper_map(struct ra_hwdec_mapper *mapper) const struct ra_hwdec_driver ra_hwdec_vdpau = 
{ .name = "vdpau-glx", .priv_size = sizeof(struct priv_owner), - .api = HWDEC_VDPAU, .imgfmts = {IMGFMT_VDPAU, 0}, .init = init, .uninit = uninit, diff --git a/video/out/opengl/ra_gl.c b/video/out/opengl/ra_gl.c index ab5c132..5b03368 100644 --- a/video/out/opengl/ra_gl.c +++ b/video/out/opengl/ra_gl.c @@ -96,14 +96,12 @@ static int ra_init_gl(struct ra *ra, GL *gl) static const int caps_map[][2] = { {RA_CAP_DIRECT_UPLOAD, 0}, - {RA_CAP_SHARED_BINDING, 0}, {RA_CAP_GLOBAL_UNIFORM, 0}, + {RA_CAP_FRAGCOORD, 0}, {RA_CAP_TEX_1D, MPGL_CAP_1D_TEX}, {RA_CAP_TEX_3D, MPGL_CAP_3D_TEX}, {RA_CAP_COMPUTE, MPGL_CAP_COMPUTE_SHADER}, {RA_CAP_NESTED_ARRAY, MPGL_CAP_NESTED_ARRAY}, - {RA_CAP_BUF_RO, MPGL_CAP_UBO}, - {RA_CAP_BUF_RW, MPGL_CAP_SSBO}, }; for (int i = 0; i < MP_ARRAY_SIZE(caps_map); i++) { @@ -111,6 +109,17 @@ static int ra_init_gl(struct ra *ra, GL *gl) ra->caps |= caps_map[i][0]; } + if (gl->BindBufferBase) { + if (gl->mpgl_caps & MPGL_CAP_UBO) + ra->caps |= RA_CAP_BUF_RO; + if (gl->mpgl_caps & MPGL_CAP_SSBO) + ra->caps |= RA_CAP_BUF_RW; + } + + // textureGather is only supported in GLSL 400+ + if (ra->glsl_version >= 400) + ra->caps |= RA_CAP_GATHER; + if (gl->BlitFramebuffer) ra->caps |= RA_CAP_BLIT; @@ -175,6 +184,8 @@ static int ra_init_gl(struct ra *ra, GL *gl) desc->chroma_w = desc->chroma_h = 1; } + fmt->glsl_format = ra_fmt_glsl_format(fmt); + MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt); } @@ -648,6 +659,11 @@ static void gl_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, gl->BindFramebuffer(GL_DRAW_FRAMEBUFFER, 0); } +static int gl_desc_namespace(enum ra_vartype type) +{ + return type; +} + static void gl_renderpass_destroy(struct ra *ra, struct ra_renderpass *pass) { GL *gl = ra_gl_get(ra); @@ -773,7 +789,7 @@ static GLuint load_program(struct ra *ra, const struct ra_renderpass_params *p, GLint status = 0; gl->GetProgramiv(prog, GL_LINK_STATUS, &status); if (status) { - MP_VERBOSE(ra, "Loading binary program succeeded.\n"); + MP_DBG(ra, 
"Loading binary program succeeded.\n"); } else { gl->DeleteProgram(prog); prog = 0; @@ -811,7 +827,7 @@ static struct ra_renderpass *gl_renderpass_create(struct ra *ra, GL *gl = ra_gl_get(ra); struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass); - pass->params = *ra_render_pass_params_copy(pass, params); + pass->params = *ra_renderpass_params_copy(pass, params); pass->params.cached_program = (bstr){0}; struct ra_renderpass_gl *pass_gl = pass->priv = talloc_zero(NULL, struct ra_renderpass_gl); @@ -1097,12 +1113,6 @@ static uint64_t gl_timer_stop(struct ra *ra, ra_timer *ratimer) return timer->result; } -static void gl_flush(struct ra *ra) -{ - GL *gl = ra_gl_get(ra); - gl->Flush(); -} - static void gl_debug_marker(struct ra *ra, const char *msg) { struct ra_gl *p = ra->priv; @@ -1123,6 +1133,7 @@ static struct ra_fns ra_fns_gl = { .clear = gl_clear, .blit = gl_blit, .uniform_layout = std140_layout, + .desc_namespace = gl_desc_namespace, .renderpass_create = gl_renderpass_create, .renderpass_destroy = gl_renderpass_destroy, .renderpass_run = gl_renderpass_run, @@ -1130,6 +1141,5 @@ static struct ra_fns ra_fns_gl = { .timer_destroy = gl_timer_destroy, .timer_start = gl_timer_start, .timer_stop = gl_timer_stop, - .flush = gl_flush, .debug_marker = gl_debug_marker, }; diff --git a/video/out/opengl/ra_gl.h b/video/out/opengl/ra_gl.h index e5e09a0..9844977 100644 --- a/video/out/opengl/ra_gl.h +++ b/video/out/opengl/ra_gl.h @@ -1,8 +1,7 @@ #pragma once #include "common.h" -#include "ra.h" -#include "gl_utils.h" +#include "utils.h" struct ra *ra_create_gl(GL *gl, struct mp_log *log); struct ra_tex *ra_create_wrapped_tex(struct ra *ra, diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c index b8fc24a..34f4736 100644 --- a/video/out/opengl/utils.c +++ b/video/out/opengl/utils.c @@ -1,371 +1,284 @@ -#include "common/msg.h" -#include "video/out/vo.h" +/* + * This file is part of mpv. + * Parts based on MPlayer code by Reimar Döffinger. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <assert.h> + +#include <libavutil/sha.h> +#include <libavutil/intreadwrite.h> +#include <libavutil/mem.h> + +#include "osdep/io.h" + +#include "common/common.h" +#include "options/path.h" +#include "stream/stream.h" +#include "formats.h" #include "utils.h" -// Standard parallel 2D projection, except y1 < y0 means that the coordinate -// system is flipped, not the projection. 
-void gl_transform_ortho(struct gl_transform *t, float x0, float x1, - float y0, float y1) +// GLU has this as gluErrorString (we don't use GLU, as it is legacy-OpenGL) +static const char *gl_error_to_string(GLenum error) { - if (y1 < y0) { - float tmp = y0; - y0 = tmp - y1; - y1 = tmp; + switch (error) { + case GL_INVALID_ENUM: return "INVALID_ENUM"; + case GL_INVALID_VALUE: return "INVALID_VALUE"; + case GL_INVALID_OPERATION: return "INVALID_OPERATION"; + case GL_INVALID_FRAMEBUFFER_OPERATION: return "INVALID_FRAMEBUFFER_OPERATION"; + case GL_OUT_OF_MEMORY: return "OUT_OF_MEMORY"; + default: return "unknown"; } - - t->m[0][0] = 2.0f / (x1 - x0); - t->m[0][1] = 0.0f; - t->m[1][0] = 0.0f; - t->m[1][1] = 2.0f / (y1 - y0); - t->t[0] = -(x1 + x0) / (x1 - x0); - t->t[1] = -(y1 + y0) / (y1 - y0); -} - -// Apply the effects of one transformation to another, transforming it in the -// process. In other words: post-composes t onto x -void gl_transform_trans(struct gl_transform t, struct gl_transform *x) -{ - struct gl_transform xt = *x; - x->m[0][0] = t.m[0][0] * xt.m[0][0] + t.m[0][1] * xt.m[1][0]; - x->m[1][0] = t.m[1][0] * xt.m[0][0] + t.m[1][1] * xt.m[1][0]; - x->m[0][1] = t.m[0][0] * xt.m[0][1] + t.m[0][1] * xt.m[1][1]; - x->m[1][1] = t.m[1][0] * xt.m[0][1] + t.m[1][1] * xt.m[1][1]; - gl_transform_vec(t, &x->t[0], &x->t[1]); -} - -void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo) -{ - int y_dir = fbo.flip ? 
-1 : 1; - gl_transform_ortho(t, 0, fbo.tex->params.w, 0, fbo.tex->params.h * y_dir); } -void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool) +void gl_check_error(GL *gl, struct mp_log *log, const char *info) { - for (int i = 0; i < pool->num_buffers; i++) - ra_buf_free(ra, &pool->buffers[i]); - - talloc_free(pool->buffers); - *pool = (struct ra_buf_pool){0}; + for (;;) { + GLenum error = gl->GetError(); + if (error == GL_NO_ERROR) + break; + mp_msg(log, MSGL_ERR, "%s: OpenGL error %s.\n", info, + gl_error_to_string(error)); + } } -static bool ra_buf_params_compatible(const struct ra_buf_params *new, - const struct ra_buf_params *old) +static int get_alignment(int stride) { - return new->type == old->type && - new->size <= old->size && - new->host_mapped == old->host_mapped && - new->host_mutable == old->host_mutable; + if (stride % 8 == 0) + return 8; + if (stride % 4 == 0) + return 4; + if (stride % 2 == 0) + return 2; + return 1; } -static bool ra_buf_pool_grow(struct ra *ra, struct ra_buf_pool *pool) +// upload a texture, handling things like stride and slices +// target: texture target, usually GL_TEXTURE_2D +// format, type: texture parameters +// dataptr, stride: image data +// x, y, width, height: part of the image to upload +void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, + const void *dataptr, int stride, + int x, int y, int w, int h) { - struct ra_buf *buf = ra_buf_create(ra, &pool->current_params); - if (!buf) - return false; - - MP_TARRAY_INSERT_AT(NULL, pool->buffers, pool->num_buffers, pool->index, buf); - MP_VERBOSE(ra, "Resized buffer pool to size %d\n", pool->num_buffers); - return true; + int bpp = gl_bytes_per_pixel(format, type); + const uint8_t *data = dataptr; + int y_max = y + h; + if (w <= 0 || h <= 0 || !bpp) + return; + assert(stride > 0); + gl->PixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(stride)); + int slice = h; + if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) { + // this is not always correct, but should 
work for MPlayer + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, stride / bpp); + } else { + if (stride != bpp * w) + slice = 1; // very inefficient, but at least it works + } + for (; y + slice <= y_max; y += slice) { + gl->TexSubImage2D(target, 0, x, y, w, slice, format, type, data); + data += stride * slice; + } + if (y < y_max) + gl->TexSubImage2D(target, 0, x, y, w, y_max - y, format, type, data); + if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH) + gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0); + gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4); } -struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, - const struct ra_buf_params *params) +mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h) { - assert(!params->initial_data); - - if (!ra_buf_params_compatible(params, &pool->current_params)) { - ra_buf_pool_uninit(ra, pool); - pool->current_params = *params; - } - - // Make sure we have at least one buffer available - if (!pool->buffers && !ra_buf_pool_grow(ra, pool)) - return NULL; - - // Make sure the next buffer is available for use - if (!ra->fns->buf_poll(ra, pool->buffers[pool->index]) && - !ra_buf_pool_grow(ra, pool)) - { + if (gl->es) + return NULL; // ES can't read from front buffer + mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, w, h); + if (!image) return NULL; + gl->BindFramebuffer(GL_FRAMEBUFFER, fbo); + GLenum obj = fbo ? 
GL_COLOR_ATTACHMENT0 : GL_FRONT; + gl->PixelStorei(GL_PACK_ALIGNMENT, 1); + gl->ReadBuffer(obj); + //flip image while reading (and also avoid stride-related trouble) + for (int y = 0; y < h; y++) { + gl->ReadPixels(0, h - y - 1, w, 1, GL_RGB, GL_UNSIGNED_BYTE, + image->planes[0] + y * image->stride[0]); } - - struct ra_buf *buf = pool->buffers[pool->index++]; - pool->index %= pool->num_buffers; - - return buf; + gl->PixelStorei(GL_PACK_ALIGNMENT, 4); + gl->BindFramebuffer(GL_FRAMEBUFFER, 0); + return image; } -bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, - const struct ra_tex_upload_params *params) +static void gl_vao_enable_attribs(struct gl_vao *vao) { - if (params->buf) - return ra->fns->tex_upload(ra, params); - - struct ra_tex *tex = params->tex; - size_t row_size = tex->params.dimensions == 2 ? params->stride : - tex->params.w * tex->params.format->pixel_size; - - struct ra_buf_params bufparams = { - .type = RA_BUF_TYPE_TEX_UPLOAD, - .size = row_size * tex->params.h * tex->params.d, - .host_mutable = true, - }; - - struct ra_buf *buf = ra_buf_pool_get(ra, pbo, &bufparams); - if (!buf) - return false; - - ra->fns->buf_update(ra, buf, 0, params->src, bufparams.size); - - struct ra_tex_upload_params newparams = *params; - newparams.buf = buf; - newparams.src = NULL; - - return ra->fns->tex_upload(ra, &newparams); -} + GL *gl = vao->gl; + + for (int n = 0; n < vao->num_entries; n++) { + const struct ra_renderpass_input *e = &vao->entries[n]; + GLenum type = 0; + bool normalized = false; + switch (e->type) { + case RA_VARTYPE_INT: + type = GL_INT; + break; + case RA_VARTYPE_FLOAT: + type = GL_FLOAT; + break; + case RA_VARTYPE_BYTE_UNORM: + type = GL_UNSIGNED_BYTE; + normalized = true; + break; + default: + abort(); + } + assert(e->dim_m == 1); -struct ra_layout std140_layout(struct ra_renderpass_input *inp) -{ - size_t el_size = ra_vartype_size(inp->type); - - // std140 packing rules: - // 1. 
The alignment of generic values is their size in bytes - // 2. The alignment of vectors is the vector length * the base count, with - // the exception of vec3 which is always aligned like vec4 - // 3. The alignment of arrays is that of the element size rounded up to - // the nearest multiple of vec4 - // 4. Matrices are treated like arrays of vectors - // 5. Arrays/matrices are laid out with a stride equal to the alignment - size_t size = el_size * inp->dim_v; - if (inp->dim_v == 3) - size += el_size; - if (inp->dim_m > 1) - size = MP_ALIGN_UP(size, sizeof(float[4])); - - return (struct ra_layout) { - .align = size, - .stride = size, - .size = size * inp->dim_m, - }; + gl->EnableVertexAttribArray(n); + gl->VertexAttribPointer(n, e->dim_v, type, normalized, + vao->stride, (void *)(intptr_t)e->offset); + } } -struct ra_layout std430_layout(struct ra_renderpass_input *inp) +void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, + const struct ra_renderpass_input *entries, + int num_entries) { - size_t el_size = ra_vartype_size(inp->type); - - // std430 packing rules: like std140, except arrays/matrices are always - // "tightly" packed, even arrays/matrices of vec3s - size_t align = el_size * inp->dim_v; - if (inp->dim_v == 3 && inp->dim_m == 1) - align += el_size; - - return (struct ra_layout) { - .align = align, - .stride = align, - .size = align * inp->dim_m, + assert(!vao->vao); + assert(!vao->buffer); + + *vao = (struct gl_vao){ + .gl = gl, + .stride = stride, + .entries = entries, + .num_entries = num_entries, }; -} - -// Create a texture and a FBO using the texture as color attachments. -// fmt: texture internal format -// If the parameters are the same as the previous call, do not touch it. -// flags can be 0, or a combination of FBOTEX_FUZZY_W and FBOTEX_FUZZY_H. -// Enabling FUZZY for W or H means the w or h does not need to be exact. 
-bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, - int w, int h, const struct ra_format *fmt, int flags) -{ - int lw = w, lh = h; - - if (fbo->tex) { - int cw = w, ch = h; - int rw = fbo->tex->params.w, rh = fbo->tex->params.h; - - if ((flags & FBOTEX_FUZZY_W) && cw < rw) - cw = rw; - if ((flags & FBOTEX_FUZZY_H) && ch < rh) - ch = rh; - - if (rw == cw && rh == ch && fbo->tex->params.format == fmt) - goto done; - } - - if (flags & FBOTEX_FUZZY_W) - w = MP_ALIGN_UP(w, 256); - if (flags & FBOTEX_FUZZY_H) - h = MP_ALIGN_UP(h, 256); - - mp_verbose(log, "Create FBO: %dx%d (%dx%d)\n", lw, lh, w, h); - - if (!fmt || !fmt->renderable || !fmt->linear_filter) { - mp_err(log, "Format %s not supported.\n", fmt ? fmt->name : "(unset)"); - return false; - } - fbotex_uninit(fbo); + gl->GenBuffers(1, &vao->buffer); - *fbo = (struct fbotex) { - .ra = ra, - }; - - struct ra_tex_params params = { - .dimensions = 2, - .w = w, - .h = h, - .d = 1, - .format = fmt, - .src_linear = true, - .render_src = true, - .render_dst = true, - .storage_dst = true, - .blit_src = true, - }; + if (gl->BindVertexArray) { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); - fbo->tex = ra_tex_create(fbo->ra, ¶ms); + gl->GenVertexArrays(1, &vao->vao); + gl->BindVertexArray(vao->vao); + gl_vao_enable_attribs(vao); + gl->BindVertexArray(0); - if (!fbo->tex) { - mp_err(log, "Error: framebuffer could not be created.\n"); - fbotex_uninit(fbo); - return false; + gl->BindBuffer(GL_ARRAY_BUFFER, 0); } +} -done: - - fbo->lw = lw; - fbo->lh = lh; +void gl_vao_uninit(struct gl_vao *vao) +{ + GL *gl = vao->gl; + if (!gl) + return; - fbo->fbo = (struct fbodst){ - .tex = fbo->tex, - }; + if (gl->DeleteVertexArrays) + gl->DeleteVertexArrays(1, &vao->vao); + gl->DeleteBuffers(1, &vao->buffer); - return true; + *vao = (struct gl_vao){0}; } -void fbotex_uninit(struct fbotex *fbo) +static void gl_vao_bind(struct gl_vao *vao) { - if (fbo->ra) { - ra_tex_free(fbo->ra, &fbo->tex); - *fbo = (struct fbotex) 
{0}; + GL *gl = vao->gl; + + if (gl->BindVertexArray) { + gl->BindVertexArray(vao->vao); + } else { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); + gl_vao_enable_attribs(vao); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); } } -struct timer_pool { - struct ra *ra; - ra_timer *timer; - bool running; // detect invalid usage - - uint64_t samples[VO_PERF_SAMPLE_COUNT]; - int sample_idx; - int sample_count; - - uint64_t sum; - uint64_t peak; -}; - -struct timer_pool *timer_pool_create(struct ra *ra) +static void gl_vao_unbind(struct gl_vao *vao) { - if (!ra->fns->timer_create) - return NULL; - - ra_timer *timer = ra->fns->timer_create(ra); - if (!timer) - return NULL; + GL *gl = vao->gl; - struct timer_pool *pool = talloc(NULL, struct timer_pool); - if (!pool) { - ra->fns->timer_destroy(ra, timer); - return NULL; + if (gl->BindVertexArray) { + gl->BindVertexArray(0); + } else { + for (int n = 0; n < vao->num_entries; n++) + gl->DisableVertexAttribArray(n); } - - *pool = (struct timer_pool){ .ra = ra, .timer = timer }; - return pool; } -void timer_pool_destroy(struct timer_pool *pool) +// Draw the vertex data (as described by the gl_vao_entry entries) in ptr +// to the screen. num is the number of vertexes. prim is usually GL_TRIANGLES. +// If ptr is NULL, then skip the upload, and use the data uploaded with the +// previous call. 
+void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num) { - if (!pool) - return; + GL *gl = vao->gl; - pool->ra->fns->timer_destroy(pool->ra, pool->timer); - talloc_free(pool); -} + if (ptr) { + gl->BindBuffer(GL_ARRAY_BUFFER, vao->buffer); + gl->BufferData(GL_ARRAY_BUFFER, num * vao->stride, ptr, GL_STREAM_DRAW); + gl->BindBuffer(GL_ARRAY_BUFFER, 0); + } -void timer_pool_start(struct timer_pool *pool) -{ - if (!pool) - return; + gl_vao_bind(vao); + + gl->DrawArrays(prim, 0, num); - assert(!pool->running); - pool->ra->fns->timer_start(pool->ra, pool->timer); - pool->running = true; + gl_vao_unbind(vao); } -void timer_pool_stop(struct timer_pool *pool) +static void GLAPIENTRY gl_debug_cb(GLenum source, GLenum type, GLuint id, + GLenum severity, GLsizei length, + const GLchar *message, const void *userParam) { - if (!pool) - return; - - assert(pool->running); - uint64_t res = pool->ra->fns->timer_stop(pool->ra, pool->timer); - pool->running = false; - - if (res) { - // Input res into the buffer and grab the previous value - uint64_t old = pool->samples[pool->sample_idx]; - pool->sample_count = MPMIN(pool->sample_count + 1, VO_PERF_SAMPLE_COUNT); - pool->samples[pool->sample_idx++] = res; - pool->sample_idx %= VO_PERF_SAMPLE_COUNT; - pool->sum = pool->sum + res - old; - - // Update peak if necessary - if (res >= pool->peak) { - pool->peak = res; - } else if (pool->peak == old) { - // It's possible that the last peak was the value we just removed, - // if so we need to scan for the new peak - uint64_t peak = res; - for (int i = 0; i < VO_PERF_SAMPLE_COUNT; i++) - peak = MPMAX(peak, pool->samples[i]); - pool->peak = peak; - } + // keep in mind that the debug callback can be asynchronous + struct mp_log *log = (void *)userParam; + int level = MSGL_ERR; + switch (severity) { + case GL_DEBUG_SEVERITY_NOTIFICATION:level = MSGL_V; break; + case GL_DEBUG_SEVERITY_LOW: level = MSGL_INFO; break; + case GL_DEBUG_SEVERITY_MEDIUM: level = MSGL_WARN; break; + 
case GL_DEBUG_SEVERITY_HIGH: level = MSGL_ERR; break; } + mp_msg(log, level, "GL: %s\n", message); } -struct mp_pass_perf timer_pool_measure(struct timer_pool *pool) +void gl_set_debug_logger(GL *gl, struct mp_log *log) { - if (!pool) - return (struct mp_pass_perf){0}; - - struct mp_pass_perf res = { - .peak = pool->peak, - .count = pool->sample_count, - }; - - int idx = pool->sample_idx - pool->sample_count + VO_PERF_SAMPLE_COUNT; - for (int i = 0; i < res.count; i++) { - idx %= VO_PERF_SAMPLE_COUNT; - res.samples[i] = pool->samples[idx++]; - } - - if (res.count > 0) { - res.last = res.samples[res.count - 1]; - res.avg = pool->sum / res.count; - } - - return res; + if (gl->DebugMessageCallback) + gl->DebugMessageCallback(log ? gl_debug_cb : NULL, log); } -void mp_log_source(struct mp_log *log, int lev, const char *src) +// Given a GL combined extension string in extensions, find out whether ext +// is included in it. Basically, a word search. +bool gl_check_extension(const char *extensions, const char *ext) { - int line = 1; - if (!src) - return; - while (*src) { - const char *end = strchr(src, '\n'); - const char *next = end + 1; - if (!end) - next = end = src + strlen(src); - mp_msg(log, lev, "[%3d] %.*s\n", line, (int)(end - src), src); - line++; - src = next; + int len = strlen(ext); + const char *cur = extensions; + while (cur) { + cur = strstr(cur, ext); + if (!cur) + break; + if ((cur == extensions || cur[-1] == ' ') && + (cur[len] == '\0' || cur[len] == ' ')) + return true; + cur += len; } + return false; } diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h index 7d00d26..53127e4 100644 --- a/video/out/opengl/utils.h +++ b/video/out/opengl/utils.h @@ -1,121 +1,56 @@ -#pragma once +/* + * This file is part of mpv. + * Parts based on MPlayer code by Reimar Döffinger. 
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef MP_GL_UTILS_ +#define MP_GL_UTILS_ -#include <stdbool.h> #include <math.h> -#include "video/out/vo.h" -#include "ra.h" +#include "video/out/gpu/utils.h" +#include "common.h" -// A 3x2 matrix, with the translation part separate. -struct gl_transform { - // row-major, e.g. in mathematical notation: - // | m[0][0] m[0][1] | - // | m[1][0] m[1][1] | - float m[2][2]; - float t[2]; -}; - -static const struct gl_transform identity_trans = { - .m = {{1.0, 0.0}, {0.0, 1.0}}, - .t = {0.0, 0.0}, -}; - -void gl_transform_ortho(struct gl_transform *t, float x0, float x1, - float y0, float y1); - -// This treats m as an affine transformation, in other words m[2][n] gets -// added to the output. 
-static inline void gl_transform_vec(struct gl_transform t, float *x, float *y) -{ - float vx = *x, vy = *y; - *x = vx * t.m[0][0] + vy * t.m[0][1] + t.t[0]; - *y = vx * t.m[1][0] + vy * t.m[1][1] + t.t[1]; -} - -struct mp_rect_f { - float x0, y0, x1, y1; -}; - -// Semantic equality (fuzzy comparison) -static inline bool mp_rect_f_seq(struct mp_rect_f a, struct mp_rect_f b) -{ - return fabs(a.x0 - b.x0) < 1e-6 && fabs(a.x1 - b.x1) < 1e-6 && - fabs(a.y0 - b.y0) < 1e-6 && fabs(a.y1 - b.y1) < 1e-6; -} - -static inline void gl_transform_rect(struct gl_transform t, struct mp_rect_f *r) -{ - gl_transform_vec(t, &r->x0, &r->y0); - gl_transform_vec(t, &r->x1, &r->y1); -} - -static inline bool gl_transform_eq(struct gl_transform a, struct gl_transform b) -{ - for (int x = 0; x < 2; x++) { - for (int y = 0; y < 2; y++) { - if (a.m[x][y] != b.m[x][y]) - return false; - } - } - - return a.t[0] == b.t[0] && a.t[1] == b.t[1]; -} - -void gl_transform_trans(struct gl_transform t, struct gl_transform *x); - -struct fbodst { - struct ra_tex *tex; - bool flip; // mirror vertically -}; - -void gl_transform_ortho_fbodst(struct gl_transform *t, struct fbodst fbo); - -// A pool of buffers, which can grow as needed -struct ra_buf_pool { - struct ra_buf_params current_params; - struct ra_buf **buffers; - int num_buffers; - int index; -}; - -void ra_buf_pool_uninit(struct ra *ra, struct ra_buf_pool *pool); +struct mp_log; -// Note: params->initial_data is *not* supported -struct ra_buf *ra_buf_pool_get(struct ra *ra, struct ra_buf_pool *pool, - const struct ra_buf_params *params); +void gl_check_error(GL *gl, struct mp_log *log, const char *info); -// Helper that wraps ra_tex_upload using texture upload buffers to ensure that -// params->buf is always set. This is intended for RA-internal usage. 
-bool ra_tex_upload_pbo(struct ra *ra, struct ra_buf_pool *pbo, - const struct ra_tex_upload_params *params); +void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type, + const void *dataptr, int stride, + int x, int y, int w, int h); -// Layout rules for GLSL's packing modes -struct ra_layout std140_layout(struct ra_renderpass_input *inp); -struct ra_layout std430_layout(struct ra_renderpass_input *inp); +mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h); -struct fbotex { - struct ra *ra; - struct ra_tex *tex; - int lw, lh; // logical (configured) size, <= than texture size - struct fbodst fbo; +struct gl_vao { + GL *gl; + GLuint vao; // the VAO object, or 0 if unsupported by driver + GLuint buffer; // GL_ARRAY_BUFFER used for the data + int stride; // size of each element (interleaved elements are assumed) + const struct ra_renderpass_input *entries; + int num_entries; }; -void fbotex_uninit(struct fbotex *fbo); -bool fbotex_change(struct fbotex *fbo, struct ra *ra, struct mp_log *log, - int w, int h, const struct ra_format *fmt, int flags); -#define FBOTEX_FUZZY_W 1 -#define FBOTEX_FUZZY_H 2 -#define FBOTEX_FUZZY (FBOTEX_FUZZY_W | FBOTEX_FUZZY_H) +void gl_vao_init(struct gl_vao *vao, GL *gl, int stride, + const struct ra_renderpass_input *entries, + int num_entries); +void gl_vao_uninit(struct gl_vao *vao); +void gl_vao_draw_data(struct gl_vao *vao, GLenum prim, void *ptr, size_t num); -// A wrapper around ra_timer that does result pooling, averaging etc. -struct timer_pool; +void gl_set_debug_logger(GL *gl, struct mp_log *log); -struct timer_pool *timer_pool_create(struct ra *ra); -void timer_pool_destroy(struct timer_pool *pool); -void timer_pool_start(struct timer_pool *pool); -void timer_pool_stop(struct timer_pool *pool); -struct mp_pass_perf timer_pool_measure(struct timer_pool *pool); +bool gl_check_extension(const char *extensions, const char *ext); -// print a multi line string with line numbers (e.g. 
for shader sources) -// log, lev: module and log level, as in mp_msg() -void mp_log_source(struct mp_log *log, int lev, const char *src); +#endif diff --git a/video/out/vo.c b/video/out/vo.c index f9c5d04..63f5b34 100644 --- a/video/out/vo.c +++ b/video/out/vo.c @@ -47,10 +47,11 @@ #include "osdep/io.h" #include "osdep/threads.h" +extern const struct vo_driver video_out_mediacodec_embed; extern const struct vo_driver video_out_x11; extern const struct vo_driver video_out_vdpau; extern const struct vo_driver video_out_xv; -extern const struct vo_driver video_out_opengl; +extern const struct vo_driver video_out_gpu; extern const struct vo_driver video_out_opengl_cb; extern const struct vo_driver video_out_null; extern const struct vo_driver video_out_image; @@ -60,34 +61,31 @@ extern const struct vo_driver video_out_drm; extern const struct vo_driver video_out_direct3d; extern const struct vo_driver video_out_sdl; extern const struct vo_driver video_out_vaapi; -extern const struct vo_driver video_out_wayland; extern const struct vo_driver video_out_rpi; extern const struct vo_driver video_out_tct; const struct vo_driver *const video_out_drivers[] = { +#if HAVE_ANDROID + &video_out_mediacodec_embed, +#endif #if HAVE_RPI &video_out_rpi, #endif -#if HAVE_GL - &video_out_opengl, -#endif + &video_out_gpu, #if HAVE_VDPAU &video_out_vdpau, #endif #if HAVE_DIRECT3D &video_out_direct3d, #endif -#if HAVE_WAYLAND - &video_out_wayland, -#endif #if HAVE_XV &video_out_xv, #endif #if HAVE_SDL2 &video_out_sdl, #endif -#if HAVE_VAAPI_X11 +#if HAVE_VAAPI_X11 && HAVE_GPL &video_out_vaapi, #endif #if HAVE_X11 @@ -136,6 +134,8 @@ struct vo_internal { int64_t nominal_vsync_interval; + bool external_renderloop_drive; + int64_t vsync_interval; int64_t *vsync_samples; int num_vsync_samples; @@ -196,8 +196,9 @@ const struct m_obj_list vo_obj_list = { .get_desc = get_desc, .description = "video outputs", .aliases = { - {"gl", "opengl"}, + {"gl", "gpu"}, {"direct3d_shaders", "direct3d"}, + 
{"opengl", "gpu"}, {0} }, .allow_unknown_entries = true, @@ -789,11 +790,12 @@ static void wait_until(struct vo *vo, int64_t target) pthread_mutex_unlock(&in->lock); } -static bool render_frame(struct vo *vo) +bool vo_render_frame_external(struct vo *vo) { struct vo_internal *in = vo->in; struct vo_frame *frame = NULL; bool got_frame = false; + bool flipped = false; update_display_fps(vo); @@ -855,6 +857,7 @@ static bool render_frame(struct vo *vo) if (in->dropped_frame) { in->drop_count += 1; } else { + flipped = true; in->rendering = true; in->hasframe_rendered = true; int64_t prev_drop_count = vo->in->drop_count; @@ -886,6 +889,11 @@ static bool render_frame(struct vo *vo) update_vsync_timing_after_swap(vo); } + if (vo->driver->caps & VO_CAP_NOREDRAW) { + talloc_free(in->current_frame); + in->current_frame = NULL; + } + if (in->dropped_frame) { MP_STATS(vo, "drop-vo"); } else { @@ -900,6 +908,8 @@ static bool render_frame(struct vo *vo) done: talloc_free(frame); pthread_mutex_unlock(&in->lock); + if (in->external_renderloop_drive) + return flipped; return got_frame || (in->frame_queued && in->frame_queued->display_synced); } @@ -907,7 +917,7 @@ static void do_redraw(struct vo *vo) { struct vo_internal *in = vo->in; - if (!vo->config_ok) + if (!vo->config_ok || (vo->driver->caps & VO_CAP_NOREDRAW)) return; pthread_mutex_lock(&in->lock); @@ -942,6 +952,44 @@ static void do_redraw(struct vo *vo) talloc_free(frame); } +static void drop_unrendered_frame(struct vo *vo) +{ + struct vo_internal *in = vo->in; + + pthread_mutex_lock(&in->lock); + + if (!in->frame_queued) + goto end; + + if ((in->frame_queued->pts + in->frame_queued->duration) > mp_time_us()) + goto end; + + MP_VERBOSE(vo, "Dropping unrendered frame (pts %"PRId64")\n", in->frame_queued->pts); + + talloc_free(in->frame_queued); + in->frame_queued = NULL; + in->hasframe = false; + pthread_cond_broadcast(&in->wakeup); + wakeup_core(vo); + +end: + pthread_mutex_unlock(&in->lock); +} + +void 
vo_enable_external_renderloop(struct vo *vo) +{ + struct vo_internal *in = vo->in; + MP_VERBOSE(vo, "Enabling event driven renderloop!\n"); + in->external_renderloop_drive = true; +} + +void vo_disable_external_renderloop(struct vo *vo) +{ + struct vo_internal *in = vo->in; + MP_VERBOSE(vo, "Disabling event driven renderloop!\n"); + in->external_renderloop_drive = false; +} + static void *vo_thread(void *ptr) { struct vo *vo = ptr; @@ -963,7 +1011,11 @@ static void *vo_thread(void *ptr) if (in->terminate) break; vo->driver->control(vo, VOCTRL_CHECK_EVENTS, NULL); - bool working = render_frame(vo); + bool working = false; + if (!in->external_renderloop_drive || !in->hasframe_rendered) + working = vo_render_frame_external(vo); + else + drop_unrendered_frame(vo); int64_t now = mp_time_us(); int64_t wait_until = now + (working ? 0 : (int64_t)1e9); @@ -976,7 +1028,7 @@ static void *vo_thread(void *ptr) wakeup_core(vo); } } - if (vo->want_redraw && !in->want_redraw) { + if (vo->want_redraw) { vo->want_redraw = false; in->want_redraw = true; wakeup_core(vo); diff --git a/video/out/vo.h b/video/out/vo.h index 2a0c3ef..995d6b9 100644 --- a/video/out/vo.h +++ b/video/out/vo.h @@ -172,6 +172,8 @@ enum { VO_CAP_ROTATE90 = 1 << 0, // VO does framedrop itself (vo_vdpau). Untimed/encoding VOs never drop. VO_CAP_FRAMEDROP = 1 << 1, + // VO does not support redraws (vo_mediacodec_embed). 
+ VO_CAP_NOREDRAW = 1 << 2, }; #define VO_MAX_REQ_FRAMES 10 @@ -374,7 +376,7 @@ struct vo { struct vo_x11_state *x11; struct vo_w32_state *w32; struct vo_cocoa_state *cocoa; - struct vo_wayland_state *wayland; + struct vo_wayland_state *wl; struct mp_hwdec_devices *hwdec_devs; struct input_ctx *input_ctx; struct osd_state *osd; @@ -431,6 +433,9 @@ void vo_query_formats(struct vo *vo, uint8_t *list); void vo_event(struct vo *vo, int event); int vo_query_and_reset_events(struct vo *vo, int events); struct mp_image *vo_get_current_frame(struct vo *vo); +void vo_enable_external_renderloop(struct vo *vo); +void vo_disable_external_renderloop(struct vo *vo); +bool vo_render_frame_external(struct vo *vo); void vo_set_queue_params(struct vo *vo, int64_t offset_us, int num_req_frames); int vo_get_num_req_frames(struct vo *vo); int64_t vo_get_vsync_interval(struct vo *vo); diff --git a/video/out/vo_caca.c b/video/out/vo_caca.c index 46090af..e63bd69 100644 --- a/video/out/vo_caca.c +++ b/video/out/vo_caca.c @@ -42,6 +42,11 @@ #include "common/msg.h" #include "input/input.h" +#include "config.h" +#if !HAVE_GPL +#error GPL only +#endif + struct priv { caca_canvas_t *canvas; caca_display_t *display; diff --git a/video/out/vo_direct3d.c b/video/out/vo_direct3d.c index 952dca8..a131d21 100644 --- a/video/out/vo_direct3d.c +++ b/video/out/vo_direct3d.c @@ -40,6 +40,11 @@ #include "w32_common.h" #include "sub/osd.h" +#include "config.h" +#if !HAVE_GPL +#error GPL only +#endif + // shaders generated by fxc.exe from d3d_shader_yuv.hlsl #include "d3d_shader_420p.h" diff --git a/video/out/vo_drm.c b/video/out/vo_drm.c index 2fdd840..24189d5 100644 --- a/video/out/vo_drm.c +++ b/video/out/vo_drm.c @@ -412,7 +412,9 @@ static int preinit(struct vo *vo) } p->kms = kms_create( - vo->log, vo->opts->drm_connector_spec, vo->opts->drm_mode_id); + vo->log, vo->opts->drm_opts->drm_connector_spec, + vo->opts->drm_opts->drm_mode_id, + vo->opts->drm_opts->drm_overlay_id); if (!p->kms) { MP_ERR(vo, 
"Failed to create KMS.\n"); goto err; diff --git a/video/out/vo_gpu.c b/video/out/vo_gpu.c new file mode 100644 index 0000000..95318d3 --- /dev/null +++ b/video/out/vo_gpu.c @@ -0,0 +1,336 @@ +/* + * Based on vo_gl.c by Reimar Doeffinger. + * + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <math.h> +#include <stdbool.h> +#include <assert.h> + +#include <libavutil/common.h> + +#include "config.h" + +#include "mpv_talloc.h" +#include "common/common.h" +#include "misc/bstr.h" +#include "common/msg.h" +#include "common/global.h" +#include "options/m_config.h" +#include "vo.h" +#include "video/mp_image.h" +#include "sub/osd.h" + +#include "gpu/context.h" +#include "gpu/hwdec.h" +#include "gpu/video.h" + +struct gpu_priv { + struct mp_log *log; + struct ra_ctx *ctx; + + char *context_name; + char *context_type; + struct ra_ctx_opts opts; + struct gl_video *renderer; + + int events; +}; + +static void resize(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + + MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); + + struct mp_rect src, dst; + struct mp_osd_res osd; + vo_get_src_dst_rects(vo, &src, &dst, &osd); + + gl_video_resize(p->renderer, &src, &dst, &osd); + + int fb_depth = sw->fns->color_depth ? 
sw->fns->color_depth(sw) : 0; + if (fb_depth) + MP_VERBOSE(p, "Reported display depth: %d\n", fb_depth); + gl_video_set_fb_depth(p->renderer, fb_depth); + + vo->want_redraw = true; +} + +static void draw_frame(struct vo *vo, struct vo_frame *frame) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + + struct ra_fbo fbo; + if (!sw->fns->start_frame(sw, &fbo)) + return; + + gl_video_render_frame(p->renderer, frame, fbo); + if (!sw->fns->submit_frame(sw, frame)) { + MP_ERR(vo, "Failed presenting frame!\n"); + return; + } +} + +static void flip_page(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + sw->fns->swap_buffers(sw); +} + +static int query_format(struct vo *vo, int format) +{ + struct gpu_priv *p = vo->priv; + if (!gl_video_check_format(p->renderer, format)) + return 0; + return 1; +} + +static int reconfig(struct vo *vo, struct mp_image_params *params) +{ + struct gpu_priv *p = vo->priv; + + if (!p->ctx->fns->reconfig(p->ctx)) + return -1; + + resize(vo); + gl_video_config(p->renderer, params); + + return 0; +} + +static void request_hwdec_api(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + + gl_video_load_hwdecs_all(p->renderer, vo->hwdec_devs); +} + +static void call_request_hwdec_api(void *ctx) +{ + // Roundabout way to run hwdec loading on the VO thread. + // Redirects to request_hwdec_api(). 
+ vo_control(ctx, VOCTRL_LOAD_HWDEC_API, NULL); +} + +static void get_and_update_icc_profile(struct gpu_priv *p) +{ + if (gl_video_icc_auto_enabled(p->renderer)) { + MP_VERBOSE(p, "Querying ICC profile...\n"); + bstr icc = bstr0(NULL); + int r = p->ctx->fns->control(p->ctx, &p->events, VOCTRL_GET_ICC_PROFILE, &icc); + + if (r != VO_NOTAVAIL) { + if (r == VO_FALSE) { + MP_WARN(p, "Could not retrieve an ICC profile.\n"); + } else if (r == VO_NOTIMPL) { + MP_ERR(p, "icc-profile-auto not implemented on this platform.\n"); + } + + gl_video_set_icc_profile(p->renderer, icc); + } + } +} + +static void get_and_update_ambient_lighting(struct gpu_priv *p) +{ + int lux; + int r = p->ctx->fns->control(p->ctx, &p->events, VOCTRL_GET_AMBIENT_LUX, &lux); + if (r == VO_TRUE) { + gl_video_set_ambient_lux(p->renderer, lux); + } + if (r != VO_TRUE && gl_video_gamma_auto_enabled(p->renderer)) { + MP_ERR(p, "gamma_auto option provided, but querying for ambient" + " lighting is not supported on this platform\n"); + } +} + +static int control(struct vo *vo, uint32_t request, void *data) +{ + struct gpu_priv *p = vo->priv; + struct ra_swapchain *sw = p->ctx->swapchain; + + switch (request) { + case VOCTRL_SET_PANSCAN: + resize(vo); + return VO_TRUE; + case VOCTRL_SET_EQUALIZER: + vo->want_redraw = true; + return VO_TRUE; + case VOCTRL_SCREENSHOT_WIN: { + struct mp_image *screen = NULL; + if (sw->fns->screenshot) + screen = sw->fns->screenshot(sw); + if (!screen) + break; // redirect to backend + // set image parameters according to the display, if possible + screen->params.color = gl_video_get_output_colorspace(p->renderer); + *(struct mp_image **)data = screen; + return true; + } + case VOCTRL_LOAD_HWDEC_API: + request_hwdec_api(vo); + return true; + case VOCTRL_UPDATE_RENDER_OPTS: { + gl_video_configure_queue(p->renderer, vo); + get_and_update_icc_profile(p); + vo->want_redraw = true; + return true; + } + case VOCTRL_RESET: + gl_video_reset(p->renderer); + return true; + case 
VOCTRL_PAUSE: + if (gl_video_showing_interpolated_frame(p->renderer)) + vo->want_redraw = true; + break; + case VOCTRL_PERFORMANCE_DATA: + gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data); + return true; + } + + int events = 0; + int r = p->ctx->fns->control(p->ctx, &events, request, data); + if (events & VO_EVENT_ICC_PROFILE_CHANGED) { + get_and_update_icc_profile(p); + vo->want_redraw = true; + } + if (events & VO_EVENT_AMBIENT_LIGHTING_CHANGED) { + get_and_update_ambient_lighting(p); + vo->want_redraw = true; + } + events |= p->events; + p->events = 0; + if (events & VO_EVENT_RESIZE) + resize(vo); + if (events & VO_EVENT_EXPOSE) + vo->want_redraw = true; + vo_event(vo, events); + + return r; +} + +static void wakeup(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + if (p->ctx && p->ctx->fns->wakeup) + p->ctx->fns->wakeup(p->ctx); +} + +static void wait_events(struct vo *vo, int64_t until_time_us) +{ + struct gpu_priv *p = vo->priv; + if (p->ctx && p->ctx->fns->wait_events) { + p->ctx->fns->wait_events(p->ctx, until_time_us); + } else { + vo_wait_default(vo, until_time_us); + } +} + +static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h, + int stride_align) +{ + struct gpu_priv *p = vo->priv; + + return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align); +} + +static void uninit(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + + gl_video_uninit(p->renderer); + if (vo->hwdec_devs) { + hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL); + hwdec_devices_destroy(vo->hwdec_devs); + } + ra_ctx_destroy(&p->ctx); +} + +static int preinit(struct vo *vo) +{ + struct gpu_priv *p = vo->priv; + p->log = vo->log; + + int alpha_mode; + mp_read_option_raw(vo->global, "alpha", &m_option_type_choice, &alpha_mode); + + struct ra_ctx_opts opts = p->opts; + opts.want_alpha = alpha_mode == 1; + + p->ctx = ra_ctx_create(vo, p->context_type, p->context_name, opts); + if (!p->ctx) + goto err_out; + assert(p->ctx->ra); + 
assert(p->ctx->swapchain); + + p->renderer = gl_video_init(p->ctx->ra, vo->log, vo->global); + gl_video_set_osd_source(p->renderer, vo->osd); + gl_video_configure_queue(p->renderer, vo); + + get_and_update_icc_profile(p); + + vo->hwdec_devs = hwdec_devices_create(); + hwdec_devices_set_loader(vo->hwdec_devs, call_request_hwdec_api, vo); + + gl_video_load_hwdecs(p->renderer, vo->hwdec_devs, false); + + return 0; + +err_out: + uninit(vo); + return -1; +} + +#define OPT_BASE_STRUCT struct gpu_priv +static const m_option_t options[] = { + OPT_STRING_VALIDATE("gpu-context", context_name, 0, ra_ctx_validate_context), + OPT_STRING_VALIDATE("gpu-api", context_type, 0, ra_ctx_validate_api), + OPT_FLAG("gpu-debug", opts.debug, 0), + OPT_FLAG("gpu-sw", opts.allow_sw, 0), + OPT_INTRANGE("swapchain-depth", opts.swapchain_depth, 0, 1, 8), + {0} +}; + +static const struct gpu_priv defaults = { .opts = { + .swapchain_depth = 3, +}}; + +const struct vo_driver video_out_gpu = { + .description = "Shader-based GPU Renderer", + .name = "gpu", + .caps = VO_CAP_ROTATE90, + .preinit = preinit, + .query_format = query_format, + .reconfig = reconfig, + .control = control, + .get_image = get_image, + .draw_frame = draw_frame, + .flip_page = flip_page, + .wait_events = wait_events, + .wakeup = wakeup, + .uninit = uninit, + .priv_size = sizeof(struct gpu_priv), + .priv_defaults = &defaults, + .options = options, +}; diff --git a/video/out/vo_lavc.c b/video/out/vo_lavc.c index be7de12..4b69231 100644 --- a/video/out/vo_lavc.c +++ b/video/out/vo_lavc.c @@ -49,7 +49,6 @@ struct priv { int64_t mindeltapts; double expected_next_pts; mp_image_t *lastimg; - int lastimg_wants_osd; int lastdisplaycount; AVRational worst_time_base; @@ -287,6 +286,14 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi) double pts = mpi ? 
mpi->pts : MP_NOPTS_VALUE; + if (mpi) { + assert(vo->params); + + struct mp_osd_res dim = osd_res_from_image_params(vo->params); + + osd_draw_on_image(vo->osd, dim, mpi->pts, OSD_DRAW_SUB_ONLY, mpi); + } + if (!vc || vc->shutdown) goto done; if (!encode_lavc_start(ectx)) { @@ -451,7 +458,6 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi) talloc_free(vc->lastimg); vc->lastimg = mpi; mpi = NULL; - vc->lastimg_wants_osd = true; vc->lastframeipts = vc->lastipts = frameipts; if (ectx->options->rawts && vc->lastipts < 0) { @@ -462,17 +468,9 @@ static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi) } else { MP_INFO(vo, "Frame at pts %d got dropped " "entirely because pts went backwards\n", (int) frameipts); - vc->lastimg_wants_osd = false; } } - if (vc->lastimg && vc->lastimg_wants_osd && vo->params) { - struct mp_osd_res dim = osd_res_from_image_params(vo->params); - - osd_draw_on_image(vo->osd, dim, vc->lastimg->pts, OSD_DRAW_SUB_ONLY, - vc->lastimg); - } - done: talloc_free(mpi); } diff --git a/video/out/vo_mediacodec_embed.c b/video/out/vo_mediacodec_embed.c new file mode 100644 index 0000000..63975e9 --- /dev/null +++ b/video/out/vo_mediacodec_embed.c @@ -0,0 +1,119 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include <libavcodec/mediacodec.h> +#include <libavutil/hwcontext.h> +#include <libavutil/hwcontext_mediacodec.h> + +#include "common/common.h" +#include "vo.h" +#include "video/mp_image.h" +#include "video/hwdec.h" + +struct priv { + struct mp_image *next_image; + struct mp_hwdec_ctx hwctx; +}; + +static AVBufferRef *create_mediacodec_device_ref(struct vo *vo) +{ + AVBufferRef *device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_MEDIACODEC); + if (!device_ref) + return NULL; + + AVHWDeviceContext *ctx = (void *)device_ref->data; + AVMediaCodecDeviceContext *hwctx = ctx->hwctx; + hwctx->surface = (void *)(intptr_t)(vo->opts->WinID); + + if (av_hwdevice_ctx_init(device_ref) < 0) + av_buffer_unref(&device_ref); + + return device_ref; +} + +static int preinit(struct vo *vo) +{ + struct priv *p = vo->priv; + vo->hwdec_devs = hwdec_devices_create(); + p->hwctx = (struct mp_hwdec_ctx){ + .driver_name = "mediacodec_embed", + .av_device_ref = create_mediacodec_device_ref(vo), + }; + hwdec_devices_add(vo->hwdec_devs, &p->hwctx); + return 0; +} + +static void flip_page(struct vo *vo) +{ + struct priv *p = vo->priv; + if (!p->next_image) + return; + + AVMediaCodecBuffer *buffer = (AVMediaCodecBuffer *)p->next_image->planes[3]; + av_mediacodec_release_buffer(buffer, 1); + mp_image_unrefp(&p->next_image); +} + +static void draw_frame(struct vo *vo, struct vo_frame *frame) +{ + struct priv *p = vo->priv; + + mp_image_t *mpi = NULL; + if (!frame->redraw && !frame->repeat) + mpi = mp_image_new_ref(frame->current); + + talloc_free(p->next_image); + p->next_image = mpi; +} + +static int query_format(struct vo *vo, int format) +{ + return format == IMGFMT_MEDIACODEC; +} + +static int control(struct vo *vo, uint32_t request, void *data) +{ + return VO_NOTIMPL; +} + +static int reconfig(struct vo *vo, struct mp_image_params *params) +{ + return 0; +} + +static void uninit(struct vo *vo) +{ + struct priv *p = vo->priv; + mp_image_unrefp(&p->next_image); + + 
hwdec_devices_remove(vo->hwdec_devs, &p->hwctx); + av_buffer_unref(&p->hwctx.av_device_ref); +} + +const struct vo_driver video_out_mediacodec_embed = { + .description = "Android (Embedded MediaCodec Surface)", + .name = "mediacodec_embed", + .caps = VO_CAP_NOREDRAW, + .preinit = preinit, + .query_format = query_format, + .control = control, + .draw_frame = draw_frame, + .flip_page = flip_page, + .reconfig = reconfig, + .uninit = uninit, + .priv_size = sizeof(struct priv), +}; diff --git a/video/out/vo_opengl.c b/video/out/vo_opengl.c deleted file mode 100644 index 72691e5..0000000 --- a/video/out/vo_opengl.c +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Based on vo_gl.c by Reimar Doeffinger. - * - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <math.h> -#include <stdbool.h> -#include <assert.h> - -#include <libavutil/common.h> - -#include "config.h" - -#include "mpv_talloc.h" -#include "common/common.h" -#include "misc/bstr.h" -#include "common/msg.h" -#include "common/global.h" -#include "options/m_config.h" -#include "vo.h" -#include "video/mp_image.h" -#include "sub/osd.h" - -#include "opengl/context.h" -#include "opengl/utils.h" -#include "opengl/hwdec.h" -#include "opengl/osd.h" -#include "filter_kernels.h" -#include "video/hwdec.h" -#include "opengl/video.h" -#include "opengl/ra_gl.h" - -#define NUM_VSYNC_FENCES 10 - -struct vo_opengl_opts { - int use_glFinish; - int waitvsync; - int use_gl_debug; - int allow_sw; - int swap_interval; - int vsync_fences; - char *backend; - int es; - int pattern[2]; -}; - -struct gl_priv { - struct vo *vo; - struct mp_log *log; - MPGLContext *glctx; - GL *gl; - struct ra *ra; - - struct vo_opengl_opts opts; - - struct gl_video *renderer; - - struct ra_hwdec *hwdec; - - int events; - - int frames_rendered; - unsigned int prev_sgi_sync_count; - - // check-pattern sub-option; for testing/debugging - int last_pattern; - int matches, mismatches; - - GLsync vsync_fences[NUM_VSYNC_FENCES]; - int num_vsync_fences; -}; - -static void resize(struct gl_priv *p) -{ - struct vo *vo = p->vo; - - MP_VERBOSE(vo, "Resize: %dx%d\n", vo->dwidth, vo->dheight); - - struct mp_rect src, dst; - struct mp_osd_res osd; - vo_get_src_dst_rects(vo, &src, &dst, &osd); - - gl_video_resize(p->renderer, &src, &dst, &osd); - - vo->want_redraw = true; -} - -static void check_pattern(struct vo *vo, int item) -{ - struct gl_priv *p = vo->priv; - int expected = p->opts.pattern[p->last_pattern]; - if (item == expected) { - p->last_pattern++; - if (p->last_pattern >= 2) - p->last_pattern = 0; - p->matches++; - } else { - p->mismatches++; - MP_WARN(vo, "wrong pattern, expected %d got %d (hit: %d, mis: %d)\n", - expected, item, 
p->matches, p->mismatches); - } -} - -static void draw_frame(struct vo *vo, struct vo_frame *frame) -{ - struct gl_priv *p = vo->priv; - GL *gl = p->gl; - - mpgl_start_frame(p->glctx); - - if (gl->FenceSync && p->num_vsync_fences < p->opts.vsync_fences) { - GLsync fence = gl->FenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);; - if (fence) - p->vsync_fences[p->num_vsync_fences++] = fence; - } - - struct fbodst target = { - .tex = ra_create_wrapped_fb(p->ra, p->glctx->main_fb, - vo->dwidth, vo->dheight), - .flip = !p->glctx->flip_v, - }; - gl_video_render_frame(p->renderer, frame, target); - ra_tex_free(p->ra, &target.tex); - - if (p->opts.use_glFinish) - gl->Finish(); -} - -static void flip_page(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - GL *gl = p->gl; - - mpgl_swap_buffers(p->glctx); - - p->frames_rendered++; - if (p->frames_rendered > 5 && !p->opts.use_gl_debug) - ra_gl_set_debug(p->ra, false); - - if (p->opts.use_glFinish) - gl->Finish(); - - if (p->opts.waitvsync || p->opts.pattern[0]) { - if (gl->GetVideoSync) { - unsigned int n1 = 0, n2 = 0; - gl->GetVideoSync(&n1); - if (p->opts.waitvsync) - gl->WaitVideoSync(2, (n1 + 1) % 2, &n2); - int step = n1 - p->prev_sgi_sync_count; - p->prev_sgi_sync_count = n1; - MP_DBG(vo, "Flip counts: %u->%u, step=%d\n", n1, n2, step); - if (p->opts.pattern[0]) - check_pattern(vo, step); - } else { - MP_WARN(vo, "GLX_SGI_video_sync not available, disabling.\n"); - p->opts.waitvsync = 0; - p->opts.pattern[0] = 0; - } - } - while (p->opts.vsync_fences > 0 && p->num_vsync_fences >= p->opts.vsync_fences) { - gl->ClientWaitSync(p->vsync_fences[0], GL_SYNC_FLUSH_COMMANDS_BIT, 1e9); - gl->DeleteSync(p->vsync_fences[0]); - MP_TARRAY_REMOVE_AT(p->vsync_fences, p->num_vsync_fences, 0); - } -} - -static int query_format(struct vo *vo, int format) -{ - struct gl_priv *p = vo->priv; - if (!gl_video_check_format(p->renderer, format)) - return 0; - return 1; -} - -static int reconfig(struct vo *vo, struct mp_image_params *params) -{ - 
struct gl_priv *p = vo->priv; - - if (mpgl_reconfig_window(p->glctx) < 0) - return -1; - - resize(p); - - gl_video_config(p->renderer, params); - - return 0; -} - -static void request_hwdec_api(struct vo *vo, void *api) -{ - struct gl_priv *p = vo->priv; - - if (p->hwdec) - return; - - p->hwdec = ra_hwdec_load_api(p->vo->log, p->ra, p->vo->global, - vo->hwdec_devs, (intptr_t)api); - gl_video_set_hwdec(p->renderer, p->hwdec); -} - -static void call_request_hwdec_api(void *ctx, enum hwdec_type type) -{ - // Roundabout way to run hwdec loading on the VO thread. - // Redirects to request_hwdec_api(). - vo_control(ctx, VOCTRL_LOAD_HWDEC_API, (void *)(intptr_t)type); -} - -static void get_and_update_icc_profile(struct gl_priv *p) -{ - if (gl_video_icc_auto_enabled(p->renderer)) { - MP_VERBOSE(p, "Querying ICC profile...\n"); - bstr icc = bstr0(NULL); - int r = mpgl_control(p->glctx, &p->events, VOCTRL_GET_ICC_PROFILE, &icc); - - if (r != VO_NOTAVAIL) { - if (r == VO_FALSE) { - MP_WARN(p, "Could not retrieve an ICC profile.\n"); - } else if (r == VO_NOTIMPL) { - MP_ERR(p, "icc-profile-auto not implemented on this platform.\n"); - } - - gl_video_set_icc_profile(p->renderer, icc); - } - } -} - -static void get_and_update_ambient_lighting(struct gl_priv *p) -{ - int lux; - int r = mpgl_control(p->glctx, &p->events, VOCTRL_GET_AMBIENT_LUX, &lux); - if (r == VO_TRUE) { - gl_video_set_ambient_lux(p->renderer, lux); - } - if (r != VO_TRUE && gl_video_gamma_auto_enabled(p->renderer)) { - MP_ERR(p, "gamma_auto option provided, but querying for ambient" - " lighting is not supported on this platform\n"); - } -} - -static int control(struct vo *vo, uint32_t request, void *data) -{ - struct gl_priv *p = vo->priv; - - switch (request) { - case VOCTRL_SET_PANSCAN: - resize(p); - return VO_TRUE; - case VOCTRL_SET_EQUALIZER: - vo->want_redraw = true; - return VO_TRUE; - case VOCTRL_SCREENSHOT_WIN: { - struct mp_image *screen = gl_read_fbo_contents(p->gl, p->glctx->main_fb, - vo->dwidth, 
vo->dheight); - if (!screen) - break; // redirect to backend - // set image parameters according to the display, if possible - screen->params.color = gl_video_get_output_colorspace(p->renderer); - if (p->glctx->flip_v) - mp_image_vflip(screen); - *(struct mp_image **)data = screen; - return true; - } - case VOCTRL_LOAD_HWDEC_API: - request_hwdec_api(vo, data); - return true; - case VOCTRL_UPDATE_RENDER_OPTS: { - gl_video_update_options(p->renderer); - get_and_update_icc_profile(p); - gl_video_configure_queue(p->renderer, p->vo); - p->vo->want_redraw = true; - return true; - } - case VOCTRL_RESET: - gl_video_reset(p->renderer); - return true; - case VOCTRL_PAUSE: - if (gl_video_showing_interpolated_frame(p->renderer)) - vo->want_redraw = true; - return true; - case VOCTRL_PERFORMANCE_DATA: - gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data); - return true; - } - - int events = 0; - int r = mpgl_control(p->glctx, &events, request, data); - if (events & VO_EVENT_ICC_PROFILE_CHANGED) { - get_and_update_icc_profile(p); - vo->want_redraw = true; - } - if (events & VO_EVENT_AMBIENT_LIGHTING_CHANGED) { - get_and_update_ambient_lighting(p); - vo->want_redraw = true; - } - events |= p->events; - p->events = 0; - if (events & VO_EVENT_RESIZE) - resize(p); - if (events & VO_EVENT_EXPOSE) - vo->want_redraw = true; - vo_event(vo, events); - - return r; -} - -static void wakeup(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - if (p->glctx && p->glctx->driver->wakeup) - p->glctx->driver->wakeup(p->glctx); -} - -static void wait_events(struct vo *vo, int64_t until_time_us) -{ - struct gl_priv *p = vo->priv; - if (p->glctx->driver->wait_events) { - p->glctx->driver->wait_events(p->glctx, until_time_us); - } else { - vo_wait_default(vo, until_time_us); - } -} - -static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h, - int stride_align) -{ - struct gl_priv *p = vo->priv; - - return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align); 
-} - -static void uninit(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - - gl_video_uninit(p->renderer); - ra_hwdec_uninit(p->hwdec); - if (vo->hwdec_devs) { - hwdec_devices_set_loader(vo->hwdec_devs, NULL, NULL); - hwdec_devices_destroy(vo->hwdec_devs); - } - ra_free(&p->ra); - mpgl_uninit(p->glctx); -} - -static int preinit(struct vo *vo) -{ - struct gl_priv *p = vo->priv; - p->vo = vo; - p->log = vo->log; - - int vo_flags = 0; - - int alpha_mode; - mp_read_option_raw(vo->global, "alpha", &m_option_type_choice, &alpha_mode); - - if (alpha_mode == 1) - vo_flags |= VOFLAG_ALPHA; - - if (p->opts.use_gl_debug) - vo_flags |= VOFLAG_GL_DEBUG; - - if (p->opts.es == 1) - vo_flags |= VOFLAG_GLES; - if (p->opts.es == 2) - vo_flags |= VOFLAG_GLES | VOFLAG_GLES2; - if (p->opts.es == -1) - vo_flags |= VOFLAG_NO_GLES; - - if (p->opts.allow_sw) - vo_flags |= VOFLAG_SW; - - p->glctx = mpgl_init(vo, p->opts.backend, vo_flags); - if (!p->glctx) - goto err_out; - p->gl = p->glctx->gl; - - if (p->gl->SwapInterval) { - p->gl->SwapInterval(p->opts.swap_interval); - } else { - MP_VERBOSE(vo, "swap_control extension missing.\n"); - } - - p->ra = ra_create_gl(p->gl, vo->log); - if (!p->ra) - goto err_out; - - p->renderer = gl_video_init(p->ra, vo->log, vo->global); - gl_video_set_osd_source(p->renderer, vo->osd); - gl_video_configure_queue(p->renderer, vo); - - get_and_update_icc_profile(p); - - vo->hwdec_devs = hwdec_devices_create(); - - hwdec_devices_set_loader(vo->hwdec_devs, call_request_hwdec_api, vo); - - p->hwdec = ra_hwdec_load(p->vo->log, p->ra, vo->global, - vo->hwdec_devs, vo->opts->gl_hwdec_interop); - gl_video_set_hwdec(p->renderer, p->hwdec); - - gl_check_error(p->gl, p->log, "before retrieving framebuffer depth"); - int fb_depth = gl_get_fb_depth(p->gl, p->glctx->main_fb); - gl_check_error(p->gl, p->log, "retrieving framebuffer depth"); - if (fb_depth) - MP_VERBOSE(p, "Reported display depth: %d\n", fb_depth); - gl_video_set_fb_depth(p->renderer, fb_depth); - - return 
0; - -err_out: - uninit(vo); - return -1; -} - -#define OPT_BASE_STRUCT struct gl_priv - -const struct vo_driver video_out_opengl = { - .description = "Extended OpenGL Renderer", - .name = "opengl", - .caps = VO_CAP_ROTATE90, - .preinit = preinit, - .query_format = query_format, - .reconfig = reconfig, - .control = control, - .get_image = get_image, - .draw_frame = draw_frame, - .flip_page = flip_page, - .wait_events = wait_events, - .wakeup = wakeup, - .uninit = uninit, - .priv_size = sizeof(struct gl_priv), - .options = (const m_option_t[]) { - OPT_FLAG("opengl-glfinish", opts.use_glFinish, 0), - OPT_FLAG("opengl-waitvsync", opts.waitvsync, 0), - OPT_INT("opengl-swapinterval", opts.swap_interval, 0), - OPT_FLAG("opengl-debug", opts.use_gl_debug, 0), - OPT_STRING_VALIDATE("opengl-backend", opts.backend, 0, - mpgl_validate_backend_opt), - OPT_FLAG("opengl-sw", opts.allow_sw, 0), - OPT_CHOICE("opengl-es", opts.es, 0, ({"no", -1}, {"auto", 0}, - {"yes", 1}, {"force2", 2})), - OPT_INTPAIR("opengl-check-pattern", opts.pattern, 0), - OPT_INTRANGE("opengl-vsync-fences", opts.vsync_fences, 0, - 0, NUM_VSYNC_FENCES), - - {0} - }, - .priv_defaults = &(const struct gl_priv){ - .opts = { - .swap_interval = 1, - }, - }, -}; diff --git a/video/out/vo_opengl_cb.c b/video/out/vo_opengl_cb.c index ea6aaa9..c8dab15 100644 --- a/video/out/vo_opengl_cb.c +++ b/video/out/vo_opengl_cb.c @@ -24,9 +24,10 @@ #include "common/global.h" #include "player/client.h" +#include "gpu/video.h" +#include "gpu/hwdec.h" #include "opengl/common.h" -#include "opengl/video.h" -#include "opengl/hwdec.h" +#include "opengl/context.h" #include "opengl/ra_gl.h" #include "libmpv/opengl_cb.h" @@ -86,9 +87,8 @@ struct mpv_opengl_cb_context { // application's OpenGL context is current - i.e. only while the // host application is calling certain mpv_opengl_cb_* APIs. 
GL *gl; - struct ra *ra; + struct ra_ctx *ra_ctx; struct gl_video *renderer; - struct ra_hwdec *hwdec; struct m_config_cache *vo_opts_cache; struct mp_vo_opts *vo_opts; }; @@ -171,18 +171,34 @@ int mpv_opengl_cb_init_gl(struct mpv_opengl_cb_context *ctx, const char *exts, return MPV_ERROR_UNSUPPORTED; } - ctx->ra = ra_create_gl(ctx->gl, ctx->log); - if (!ctx->ra) - return MPV_ERROR_UNSUPPORTED; + // initialize a blank ra_ctx to reuse ra_gl_ctx + ctx->ra_ctx = talloc_zero(ctx, struct ra_ctx); + ctx->ra_ctx->log = ctx->log; + ctx->ra_ctx->global = ctx->global; + ctx->ra_ctx->opts = (struct ra_ctx_opts) { + .probing = false, + .allow_sw = true, + }; + + static const struct ra_swapchain_fns empty_swapchain_fns = {0}; + struct ra_gl_ctx_params gl_params = { + // vo_opengl_cb is essentially like a gigantic external swapchain where + // the user is in charge of presentation / swapping etc. But we don't + // actually need to provide any of these functions, since we can just + // not call them to begin with - so just set it to an empty object to + // signal to ra_gl_ctx that we don't care about its latency emulation + // functionality + .external_swapchain = &empty_swapchain_fns + }; - ctx->renderer = gl_video_init(ctx->ra, ctx->log, ctx->global); + ctx->gl->SwapInterval = NULL; // we shouldn't randomly change this, so lock it + if (!ra_gl_ctx_init(ctx->ra_ctx, ctx->gl, gl_params)) + return MPV_ERROR_UNSUPPORTED; - m_config_cache_update(ctx->vo_opts_cache); + ctx->renderer = gl_video_init(ctx->ra_ctx->ra, ctx->log, ctx->global); ctx->hwdec_devs = hwdec_devices_create(); - ctx->hwdec = ra_hwdec_load(ctx->log, ctx->ra, ctx->global, - ctx->hwdec_devs, ctx->vo_opts->gl_hwdec_interop); - gl_video_set_hwdec(ctx->renderer, ctx->hwdec); + gl_video_load_hwdecs(ctx->renderer, ctx->hwdec_devs, true); pthread_mutex_lock(&ctx->lock); for (int n = IMGFMT_START; n < IMGFMT_END; n++) { @@ -217,12 +233,12 @@ int mpv_opengl_cb_uninit_gl(struct mpv_opengl_cb_context *ctx) 
gl_video_uninit(ctx->renderer); ctx->renderer = NULL; - ra_hwdec_uninit(ctx->hwdec); - ctx->hwdec = NULL; hwdec_devices_destroy(ctx->hwdec_devs); ctx->hwdec_devs = NULL; - ra_free(&ctx->ra); + ra_gl_ctx_uninit(ctx->ra_ctx); + talloc_free(ctx->ra_ctx); talloc_free(ctx->gl); + ctx->ra_ctx = NULL; ctx->gl = NULL; return 0; } @@ -236,11 +252,6 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) return MPV_ERROR_UNSUPPORTED; } - struct fbodst target = { - .tex = ra_create_wrapped_fb(ctx->ra, fbo, vp_w, abs(vp_h)), - .flip = vp_h < 0, - }; - reset_gl_state(ctx->gl); pthread_mutex_lock(&ctx->lock); @@ -273,14 +284,13 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) gl_video_config(ctx->renderer, &ctx->img_params); } if (ctx->update_new_opts) { - gl_video_update_options(ctx->renderer); if (vo) gl_video_configure_queue(ctx->renderer, vo); int debug; - mp_read_option_raw(ctx->global, "opengl-debug", &m_option_type_flag, + mp_read_option_raw(ctx->global, "gpu-debug", &m_option_type_flag, &debug); ctx->gl->debug_context = debug; - ra_gl_set_debug(ctx->ra, debug); + ra_gl_set_debug(ctx->ra_ctx->ra, debug); if (gl_video_icc_auto_enabled(ctx->renderer)) MP_ERR(ctx, "icc-profile-auto is not available with opengl-cb\n"); } @@ -316,7 +326,13 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) pthread_mutex_unlock(&ctx->lock); MP_STATS(ctx, "glcb-render"); + struct ra_swapchain *sw = ctx->ra_ctx->swapchain; + struct ra_fbo target; + ra_gl_ctx_resize(sw, vp_w, abs(vp_h), fbo); + ra_gl_ctx_start_frame(sw, &target); + target.flip = vp_h < 0; gl_video_render_frame(ctx->renderer, frame, target); + ra_gl_ctx_submit_frame(sw, frame); reset_gl_state(ctx->gl); @@ -328,8 +344,6 @@ int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h) pthread_cond_wait(&ctx->wakeup, &ctx->lock); pthread_mutex_unlock(&ctx->lock); - ra_tex_free(ctx->ra, &target.tex); - return 0; } diff --git 
a/video/out/vo_rpi.c b/video/out/vo_rpi.c index 5b5d62c..4322a3f 100644 --- a/video/out/vo_rpi.c +++ b/video/out/vo_rpi.c @@ -44,7 +44,7 @@ #include "sub/osd.h" #include "opengl/ra_gl.h" -#include "opengl/video.h" +#include "gpu/video.h" struct mp_egl_rpi { struct mp_log *log; @@ -261,7 +261,7 @@ static void update_osd(struct vo *vo) MP_STATS(vo, "start rpi_osd"); struct vo_frame frame = {0}; - struct fbodst target = { + struct ra_fbo target = { .tex = ra_create_wrapped_fb(p->egl.ra, 0, p->osd_res.w, p->osd_res.h), .flip = true, }; diff --git a/video/out/vo_vaapi.c b/video/out/vo_vaapi.c index 3468ac6..a3f7015 100644 --- a/video/out/vo_vaapi.c +++ b/video/out/vo_vaapi.c @@ -96,6 +96,8 @@ struct priv { VADisplayAttribute *va_display_attrs; int *mp_display_attr; int va_num_display_attrs; + + struct va_image_formats *image_formats; }; #define OSD_VA_FORMAT VA_FOURCC_BGRA @@ -108,6 +110,306 @@ static const bool osd_formats[SUBBITMAP_COUNT] = { static void draw_osd(struct vo *vo); + +struct fmtentry { + uint32_t va; + enum mp_imgfmt mp; +}; + +static const struct fmtentry va_to_imgfmt[] = { + {VA_FOURCC_NV12, IMGFMT_NV12}, + {VA_FOURCC_YV12, IMGFMT_420P}, + {VA_FOURCC_IYUV, IMGFMT_420P}, + {VA_FOURCC_UYVY, IMGFMT_UYVY}, + // Note: not sure about endian issues (the mp formats are byte-addressed) + {VA_FOURCC_RGBA, IMGFMT_RGBA}, + {VA_FOURCC_RGBX, IMGFMT_RGBA}, + {VA_FOURCC_BGRA, IMGFMT_BGRA}, + {VA_FOURCC_BGRX, IMGFMT_BGRA}, + {0 , IMGFMT_NONE} +}; + +static enum mp_imgfmt va_fourcc_to_imgfmt(uint32_t fourcc) +{ + for (const struct fmtentry *entry = va_to_imgfmt; entry->va; ++entry) { + if (entry->va == fourcc) + return entry->mp; + } + return IMGFMT_NONE; +} + +static uint32_t va_fourcc_from_imgfmt(int imgfmt) +{ + for (const struct fmtentry *entry = va_to_imgfmt; entry->va; ++entry) { + if (entry->mp == imgfmt) + return entry->va; + } + return 0; +} + +struct va_image_formats { + VAImageFormat *entries; + int num; +}; + +static void va_get_formats(struct priv *ctx) +{ 
+ struct va_image_formats *formats = talloc_ptrtype(ctx, formats); + formats->num = vaMaxNumImageFormats(ctx->display); + formats->entries = talloc_array(formats, VAImageFormat, formats->num); + VAStatus status = vaQueryImageFormats(ctx->display, formats->entries, + &formats->num); + if (!CHECK_VA_STATUS(ctx, "vaQueryImageFormats()")) + return; + MP_VERBOSE(ctx, "%d image formats available:\n", formats->num); + for (int i = 0; i < formats->num; i++) + MP_VERBOSE(ctx, " %s\n", mp_tag_str(formats->entries[i].fourcc)); + ctx->image_formats = formats; +} + +static VAImageFormat *va_image_format_from_imgfmt(struct priv *ctx, + int imgfmt) +{ + struct va_image_formats *formats = ctx->image_formats; + const int fourcc = va_fourcc_from_imgfmt(imgfmt); + if (!formats || !formats->num || !fourcc) + return NULL; + for (int i = 0; i < formats->num; i++) { + if (formats->entries[i].fourcc == fourcc) + return &formats->entries[i]; + } + return NULL; +} + +struct va_surface { + struct mp_vaapi_ctx *ctx; + VADisplay display; + + VASurfaceID id; + int rt_format; + + // The actually allocated surface size (needed for cropping). + // mp_images can have a smaller size than this, which means they are + // cropped down to a smaller size by removing right/bottom pixels. + int w, h; + + VAImage image; // used for software decoding case + bool is_derived; // is image derived by vaDeriveImage()? +}; + +static struct va_surface *va_surface_in_mp_image(struct mp_image *mpi) +{ + return mpi && mpi->imgfmt == IMGFMT_VAAPI ? 
+ (struct va_surface*)mpi->planes[0] : NULL; +} + +static void release_va_surface(void *arg) +{ + struct va_surface *surface = arg; + + if (surface->id != VA_INVALID_ID) { + if (surface->image.image_id != VA_INVALID_ID) + vaDestroyImage(surface->display, surface->image.image_id); + vaDestroySurfaces(surface->display, &surface->id, 1); + } + + talloc_free(surface); +} + +static struct mp_image *alloc_surface(struct mp_vaapi_ctx *ctx, int rt_format, + int w, int h) +{ + VASurfaceID id = VA_INVALID_ID; + VAStatus status; + status = vaCreateSurfaces(ctx->display, rt_format, w, h, &id, 1, NULL, 0); + if (!CHECK_VA_STATUS(ctx, "vaCreateSurfaces()")) + return NULL; + + struct va_surface *surface = talloc_ptrtype(NULL, surface); + if (!surface) + return NULL; + + *surface = (struct va_surface){ + .ctx = ctx, + .id = id, + .rt_format = rt_format, + .w = w, + .h = h, + .display = ctx->display, + .image = { .image_id = VA_INVALID_ID, .buf = VA_INVALID_ID }, + }; + + struct mp_image img = {0}; + mp_image_setfmt(&img, IMGFMT_VAAPI); + mp_image_set_size(&img, w, h); + img.planes[0] = (uint8_t*)surface; + img.planes[3] = (uint8_t*)(uintptr_t)surface->id; + return mp_image_new_custom_ref(&img, surface, release_va_surface); +} + +static void va_surface_image_destroy(struct va_surface *surface) +{ + if (!surface || surface->image.image_id == VA_INVALID_ID) + return; + vaDestroyImage(surface->display, surface->image.image_id); + surface->image.image_id = VA_INVALID_ID; + surface->is_derived = false; +} + +static int va_surface_image_alloc(struct va_surface *p, VAImageFormat *format) +{ + VADisplay *display = p->display; + + if (p->image.image_id != VA_INVALID_ID && + p->image.format.fourcc == format->fourcc) + return 0; + + int r = 0; + + va_surface_image_destroy(p); + + VAStatus status = vaDeriveImage(display, p->id, &p->image); + if (status == VA_STATUS_SUCCESS) { + /* vaDeriveImage() is supported, check format */ + if (p->image.format.fourcc == format->fourcc && + p->image.width 
== p->w && p->image.height == p->h) + { + p->is_derived = true; + MP_TRACE(p->ctx, "Using vaDeriveImage()\n"); + } else { + vaDestroyImage(p->display, p->image.image_id); + status = VA_STATUS_ERROR_OPERATION_FAILED; + } + } + if (status != VA_STATUS_SUCCESS) { + p->image.image_id = VA_INVALID_ID; + status = vaCreateImage(p->display, format, p->w, p->h, &p->image); + if (!CHECK_VA_STATUS(p->ctx, "vaCreateImage()")) { + p->image.image_id = VA_INVALID_ID; + r = -1; + } + } + + return r; +} + +// img must be a VAAPI surface; make sure its internal VAImage is allocated +// to a format corresponding to imgfmt (or return an error). +static int va_surface_alloc_imgfmt(struct priv *priv, struct mp_image *img, + int imgfmt) +{ + struct va_surface *p = va_surface_in_mp_image(img); + if (!p) + return -1; + // Multiple FourCCs can refer to the same imgfmt, so check by doing the + // surjective conversion first. + if (p->image.image_id != VA_INVALID_ID && + va_fourcc_to_imgfmt(p->image.format.fourcc) == imgfmt) + return 0; + VAImageFormat *format = va_image_format_from_imgfmt(priv, imgfmt); + if (!format) + return -1; + if (va_surface_image_alloc(p, format) < 0) + return -1; + return 0; +} + +static bool va_image_map(struct mp_vaapi_ctx *ctx, VAImage *image, + struct mp_image *mpi) +{ + int imgfmt = va_fourcc_to_imgfmt(image->format.fourcc); + if (imgfmt == IMGFMT_NONE) + return false; + void *data = NULL; + const VAStatus status = vaMapBuffer(ctx->display, image->buf, &data); + if (!CHECK_VA_STATUS(ctx, "vaMapBuffer()")) + return false; + + *mpi = (struct mp_image) {0}; + mp_image_setfmt(mpi, imgfmt); + mp_image_set_size(mpi, image->width, image->height); + + for (int p = 0; p < image->num_planes; p++) { + mpi->stride[p] = image->pitches[p]; + mpi->planes[p] = (uint8_t *)data + image->offsets[p]; + } + + if (image->format.fourcc == VA_FOURCC_YV12) { + MPSWAP(int, mpi->stride[1], mpi->stride[2]); + MPSWAP(uint8_t *, mpi->planes[1], mpi->planes[2]); + } + + return true; +} + 
+static bool va_image_unmap(struct mp_vaapi_ctx *ctx, VAImage *image) +{ + const VAStatus status = vaUnmapBuffer(ctx->display, image->buf); + return CHECK_VA_STATUS(ctx, "vaUnmapBuffer()"); +} + +// va_dst: copy destination, must be IMGFMT_VAAPI +// sw_src: copy source, must be a software pixel format +static int va_surface_upload(struct priv *priv, struct mp_image *va_dst, + struct mp_image *sw_src) +{ + struct va_surface *p = va_surface_in_mp_image(va_dst); + if (!p) + return -1; + + if (va_surface_alloc_imgfmt(priv, va_dst, sw_src->imgfmt) < 0) + return -1; + + struct mp_image img; + if (!va_image_map(p->ctx, &p->image, &img)) + return -1; + assert(sw_src->w <= img.w && sw_src->h <= img.h); + mp_image_set_size(&img, sw_src->w, sw_src->h); // copy only visible part + mp_image_copy(&img, sw_src); + va_image_unmap(p->ctx, &p->image); + + if (!p->is_derived) { + VAStatus status = vaPutImage(p->display, p->id, + p->image.image_id, + 0, 0, sw_src->w, sw_src->h, + 0, 0, sw_src->w, sw_src->h); + if (!CHECK_VA_STATUS(p->ctx, "vaPutImage()")) + return -1; + } + + if (p->is_derived) + va_surface_image_destroy(p); + return 0; +} + +struct pool_alloc_ctx { + struct mp_vaapi_ctx *vaapi; + int rt_format; +}; + +static struct mp_image *alloc_pool(void *pctx, int fmt, int w, int h) +{ + struct pool_alloc_ctx *alloc_ctx = pctx; + if (fmt != IMGFMT_VAAPI) + return NULL; + + return alloc_surface(alloc_ctx->vaapi, alloc_ctx->rt_format, w, h); +} + +// The allocator of the given image pool to allocate VAAPI surfaces, using +// the given rt_format. 
+static void va_pool_set_allocator(struct mp_image_pool *pool, + struct mp_vaapi_ctx *ctx, int rt_format) +{ + struct pool_alloc_ctx *alloc_ctx = talloc_ptrtype(pool, alloc_ctx); + *alloc_ctx = (struct pool_alloc_ctx){ + .vaapi = ctx, + .rt_format = rt_format, + }; + mp_image_pool_set_allocator(pool, alloc_pool, alloc_ctx); + mp_image_pool_set_lru(pool); +} + static void flush_output_surfaces(struct priv *p) { for (int n = 0; n < MAX_OUTPUT_SURFACES; n++) @@ -135,7 +437,7 @@ static bool alloc_swdec_surfaces(struct priv *p, int w, int h, int imgfmt) free_video_specific(p); for (int i = 0; i < MAX_OUTPUT_SURFACES; i++) { p->swdec_surfaces[i] = mp_image_pool_get(p->pool, IMGFMT_VAAPI, w, h); - if (va_surface_alloc_imgfmt(p->swdec_surfaces[i], imgfmt) < 0) + if (va_surface_alloc_imgfmt(p, p->swdec_surfaces[i], imgfmt) < 0) return false; } return true; @@ -172,7 +474,7 @@ static int reconfig(struct vo *vo, struct mp_image_params *params) static int query_format(struct vo *vo, int imgfmt) { struct priv *p = vo->priv; - if (imgfmt == IMGFMT_VAAPI || va_image_format_from_imgfmt(p->mpvaapi, imgfmt)) + if (imgfmt == IMGFMT_VAAPI || va_image_format_from_imgfmt(p, imgfmt)) return 1; return 0; @@ -193,7 +495,7 @@ static bool render_to_screen(struct priv *p, struct mp_image *mpi) struct mp_image *img = mp_image_alloc(fmt, w, h); if (img) { mp_image_clear(img, 0, 0, w, h); - if (va_surface_upload(p->black_surface, img) < 0) + if (va_surface_upload(p, p->black_surface, img) < 0) mp_image_unrefp(&p->black_surface); talloc_free(img); } @@ -268,7 +570,7 @@ static void draw_image(struct vo *vo, struct mp_image *mpi) if (mpi->imgfmt != IMGFMT_VAAPI) { struct mp_image *dst = p->swdec_surfaces[p->output_surface]; - if (!dst || va_surface_upload(dst, mpi) < 0) { + if (!dst || va_surface_upload(p, dst, mpi) < 0) { MP_WARN(vo, "Could not upload surface.\n"); talloc_free(mpi); return; @@ -510,6 +812,10 @@ static int preinit(struct vo *vo) "It's better to use VDPAU directly with: 
--vo=vdpau\n"); } + va_get_formats(p); + if (!p->image_formats) + goto fail; + p->pool = mp_image_pool_new(MAX_OUTPUT_SURFACES + 3); va_pool_set_allocator(p->pool, p->mpvaapi, VA_RT_FORMAT_YUV420); diff --git a/video/out/vo_wayland.c b/video/out/vo_wayland.c deleted file mode 100644 index 37ab4c7..0000000 --- a/video/out/vo_wayland.c +++ /dev/null @@ -1,682 +0,0 @@ -/* - * This file is part of mpv video player. - * Copyright © 2013 Alexander Preisinger <alexander.preisinger@gmail.com> - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <stdio.h> -#include <stdbool.h> -#include <assert.h> - -#include <libavutil/common.h> - -#include "config.h" - -#include "vo.h" -#include "video/mp_image.h" -#include "video/sws_utils.h" -#include "sub/osd.h" -#include "sub/img_convert.h" -#include "common/msg.h" -#include "input/input.h" -#include "osdep/endian.h" -#include "osdep/timer.h" - -#include "wayland_common.h" - -#include "video/out/wayland/buffer.h" - -static void draw_image(struct vo *vo, mp_image_t *mpi); -static void draw_osd(struct vo *vo); - -static const struct wl_buffer_listener buffer_listener; - -// TODO: pay attention to the reported subpixel order -static const format_t format_table[] = { - {WL_SHM_FORMAT_ARGB8888, IMGFMT_BGRA}, // 8b 8g 8r 8a - {WL_SHM_FORMAT_XRGB8888, IMGFMT_BGR0}, -#if BYTE_ORDER == LITTLE_ENDIAN - {WL_SHM_FORMAT_RGB565, IMGFMT_RGB565}, // 5b 6g 5r -#endif - {WL_SHM_FORMAT_RGB888, IMGFMT_BGR24}, // 8b 8g 8r - {WL_SHM_FORMAT_BGR888, IMGFMT_RGB24}, // 8r 8g 8b - {WL_SHM_FORMAT_XBGR8888, IMGFMT_RGB0}, - {WL_SHM_FORMAT_RGBX8888, IMGFMT_0BGR}, - {WL_SHM_FORMAT_BGRX8888, IMGFMT_0RGB}, - {WL_SHM_FORMAT_ABGR8888, IMGFMT_RGBA}, - {WL_SHM_FORMAT_RGBA8888, IMGFMT_ABGR}, - {WL_SHM_FORMAT_BGRA8888, IMGFMT_ARGB}, -}; - -#define MAX_FORMAT_ENTRIES (sizeof(format_table) / sizeof(format_table[0])) -#define DEFAULT_FORMAT_ENTRY 1 -#define DEFAULT_ALPHA_FORMAT_ENTRY 0 - -struct priv; - -// We only use double buffering but the creation and usage is still open to -// triple buffering. Triple buffering is now removed, because double buffering -// is now pixel-perfect. 
-struct buffer_pool { - shm_buffer_t **buffers; - shm_buffer_t *front_buffer; // just pointers to any of the buffers - shm_buffer_t *back_buffer; - uint32_t buffer_no; -}; - -struct supported_format { - format_t format; - bool is_alpha; - struct wl_list link; -}; - -struct priv { - struct vo *vo; - struct vo_wayland_state *wl; - - struct wl_list format_list; - const format_t *video_format; // pointer to element in supported_format list - - struct mp_rect src; - struct mp_rect dst; - int src_w, src_h; - int dst_w, dst_h; - struct mp_osd_res osd; - - struct mp_sws_context *sws; - struct mp_image_params in_format; - - struct buffer_pool video_bufpool; - - struct mp_image *original_image; - int width; // width of the original image - int height; - - int x, y; // coords for resizing - - struct wl_surface *osd_surfaces[MAX_OSD_PARTS]; - struct wl_subsurface *osd_subsurfaces[MAX_OSD_PARTS]; - shm_buffer_t *osd_buffers[MAX_OSD_PARTS]; - // this id tells us if the subtitle part has changed or not - int change_id[MAX_OSD_PARTS]; - - // options - int enable_alpha; - int use_rgb565; -}; - -static bool is_alpha_format(const format_t *fmt) -{ - return !!(mp_imgfmt_get_desc(fmt->mp_format).flags & MP_IMGFLAG_ALPHA); -} - -static const format_t* is_wayland_format_supported(struct priv *p, - enum wl_shm_format fmt) -{ - struct supported_format *sf; - - // find the matching format first - wl_list_for_each(sf, &p->format_list, link) { - if (sf->format.wl_format == fmt) { - return &sf->format; - } - } - - return NULL; -} - -// additional buffer functions - -static void buffer_finalise_front(shm_buffer_t *buf) -{ - SHM_BUFFER_SET_BUSY(buf); - SHM_BUFFER_CLEAR_DIRTY(buf); -} - -static void buffer_finalise_back(shm_buffer_t *buf) -{ - SHM_BUFFER_SET_DIRTY(buf); -} - -static struct mp_image buffer_get_mp_image(struct priv *p, - shm_buffer_t *buf) -{ - struct mp_image img = {0}; - mp_image_set_params(&img, &p->sws->dst); - - img.w = buf->stride / buf->bytes; - img.h = buf->height; - 
img.planes[0] = buf->data; - img.stride[0] = buf->stride; - - return img; -} - -// buffer pool functions - -static void buffer_pool_reinit(struct priv *p, - struct buffer_pool *pool, - uint32_t buffer_no, - uint32_t width, uint32_t height, - format_t fmt, - struct wl_shm *shm) -{ - if (!pool->buffers) - pool->buffers = calloc(buffer_no, sizeof(shm_buffer_t*)); - - pool->buffer_no = buffer_no; - - for (uint32_t i = 0; i < buffer_no; ++i) { - if (pool->buffers[i] == NULL) - pool->buffers[i] = shm_buffer_create(width, height, fmt, - shm, &buffer_listener); - else - shm_buffer_resize(pool->buffers[i], width, height); - } - - pool->back_buffer = pool->buffers[0]; - pool->front_buffer = pool->buffers[1]; -} - -static bool buffer_pool_resize(struct buffer_pool *pool, - int width, - int height) -{ - bool ret = true; - - for (uint32_t i = 0; ret && i < pool->buffer_no; ++i) - shm_buffer_resize(pool->buffers[i], width, height); - - return ret; -} - -static void buffer_pool_destroy(struct buffer_pool *pool) -{ - for (uint32_t i = 0; i < pool->buffer_no; ++i) - shm_buffer_destroy(pool->buffers[i]); - - free(pool->buffers); - pool->front_buffer = NULL; - pool->back_buffer = NULL; - pool->buffers = NULL; -} - -static void buffer_pool_swap(struct buffer_pool *pool) -{ - if (SHM_BUFFER_IS_DIRTY(pool->back_buffer)) { - shm_buffer_t *tmp = pool->back_buffer; - pool->back_buffer = pool->front_buffer; - pool->front_buffer = tmp; - } -} - -// returns NULL if the back buffer is busy -static shm_buffer_t * buffer_pool_get_back(struct buffer_pool *pool) -{ - if (!pool->back_buffer || SHM_BUFFER_IS_BUSY(pool->back_buffer)) - return NULL; - - return pool->back_buffer; -} - -static shm_buffer_t * buffer_pool_get_front(struct buffer_pool *pool) -{ - return pool->front_buffer; -} - -static bool redraw_frame(struct priv *p) -{ - draw_image(p->vo, NULL); - return true; -} - -static bool resize(struct priv *p) -{ - struct vo_wayland_state *wl = p->wl; - - if (!p->video_bufpool.back_buffer || 
SHM_BUFFER_IS_BUSY(p->video_bufpool.back_buffer)) - return false; // skip resizing if we can't guarantee pixel perfectness! - - int32_t scale = 1; - int32_t x = wl->window.sh_x; - int32_t y = wl->window.sh_y; - - if (wl->display.current_output) - scale = wl->display.current_output->scale; - - wl->vo->dwidth = scale*wl->window.sh_width; - wl->vo->dheight = scale*wl->window.sh_height; - - vo_get_src_dst_rects(p->vo, &p->src, &p->dst, &p->osd); - p->src_w = p->src.x1 - p->src.x0; - p->src_h = p->src.y1 - p->src.y0; - p->dst_w = p->dst.x1 - p->dst.x0; - p->dst_h = p->dst.y1 - p->dst.y0; - - mp_input_set_mouse_transform(p->vo->input_ctx, &p->dst, NULL); - - MP_DBG(wl, "resizing %dx%d -> %dx%d\n", wl->window.width, - wl->window.height, - p->dst_w, - p->dst_h); - - if (x != 0) - x = wl->window.width - p->dst_w; - - if (y != 0) - y = wl->window.height - p->dst_h; - - wl_surface_set_buffer_scale(wl->window.video_surface, scale); - mp_sws_set_from_cmdline(p->sws, p->vo->opts->sws_opts); - p->sws->src = p->in_format; - p->sws->dst = (struct mp_image_params) { - .imgfmt = p->video_format->mp_format, - .w = p->dst_w, - .h = p->dst_h, - .p_w = 1, - .p_h = 1, - }; - - mp_image_params_guess_csp(&p->sws->dst); - - if (mp_sws_reinit(p->sws) < 0) - return false; - - if (!buffer_pool_resize(&p->video_bufpool, p->dst_w, p->dst_h)) { - MP_ERR(wl, "failed to resize video buffers\n"); - return false; - } - - wl->window.width = p->dst_w; - wl->window.height = p->dst_h; - - // if no alpha enabled format is used then create an opaque region to allow - // the compositor to optimize the drawing of the window - if (!p->enable_alpha) { - struct wl_region *opaque = - wl_compositor_create_region(wl->display.compositor); - wl_region_add(opaque, 0, 0, p->dst_w/scale, p->dst_h/scale); - wl_surface_set_opaque_region(wl->window.video_surface, opaque); - wl_region_destroy(opaque); - } - - p->x = x; - p->y = y; - p->vo->want_redraw = true; - return true; -} - - -/* wayland listeners */ - -static void 
buffer_handle_release(void *data, struct wl_buffer *buffer) -{ - shm_buffer_t *buf = data; - - if (SHM_BUFFER_IS_ONESHOT(buf)) { - shm_buffer_destroy(buf); - return; - } - - SHM_BUFFER_CLEAR_BUSY(buf); - // does nothing and returns 0 if no pending resize flag was set - shm_buffer_pending_resize(buf); -} - -static const struct wl_buffer_listener buffer_listener = { - buffer_handle_release -}; - -static void shm_handle_format(void *data, - struct wl_shm *wl_shm, - uint32_t format) -{ - struct priv *p = data; - for (uint32_t i = 0; i < MAX_FORMAT_ENTRIES; ++i) { - if (format_table[i].wl_format == format) { - MP_INFO(p->wl, "format %s supported by hw\n", - mp_imgfmt_to_name(format_table[i].mp_format)); - struct supported_format *sf = talloc(p, struct supported_format); - sf->format = format_table[i]; - sf->is_alpha = is_alpha_format(&sf->format); - wl_list_insert(&p->format_list, &sf->link); - } - } -} - -static const struct wl_shm_listener shm_listener = { - shm_handle_format -}; - - -/* mpv interface */ - -static void draw_image(struct vo *vo, mp_image_t *mpi) -{ - struct priv *p = vo->priv; - - if (mpi) { - talloc_free(p->original_image); - p->original_image = mpi; - } - - vo_wayland_wait_events(vo, 0); - - shm_buffer_t *buf = buffer_pool_get_back(&p->video_bufpool); - - if (!buf) { - MP_VERBOSE(p->wl, "can't draw, back buffer is busy\n"); - return; - } - - struct mp_image img = buffer_get_mp_image(p, buf); - - if (p->original_image) { - struct mp_image src = *p->original_image; - struct mp_rect src_rc = p->src; - src_rc.x0 = MP_ALIGN_DOWN(src_rc.x0, src.fmt.align_x); - src_rc.y0 = MP_ALIGN_DOWN(src_rc.y0, src.fmt.align_y); - mp_image_crop_rc(&src, src_rc); - - mp_sws_scale(p->sws, &img, &src); - } else { - mp_image_clear(&img, 0, 0, img.w, img.h); - } - - buffer_finalise_back(buf); - - draw_osd(vo); -} - -static void draw_osd_cb(void *ctx, struct sub_bitmaps *imgs) -{ - struct priv *p = ctx; - int id = imgs->render_index; - - struct wl_surface *s = 
p->osd_surfaces[id]; - - if (imgs->change_id != p->change_id[id]) { - p->change_id[id] = imgs->change_id; - - struct mp_rect bb; - if (!mp_sub_bitmaps_bb(imgs, &bb)) - return; - - int width = mp_rect_w(bb); - int height = mp_rect_h(bb); - - if (!p->osd_buffers[id]) { - p->osd_buffers[id] = shm_buffer_create(width, - height, - format_table[DEFAULT_ALPHA_FORMAT_ENTRY], - p->wl->display.shm, - &buffer_listener); - } - else if (SHM_BUFFER_IS_BUSY(p->osd_buffers[id])) { - // freed on release in buffer_listener - // guarantees pixel perfect resizing of subtitles and osd - SHM_BUFFER_SET_ONESHOT(p->osd_buffers[id]); - p->osd_buffers[id] = shm_buffer_create(width, - height, - format_table[DEFAULT_ALPHA_FORMAT_ENTRY], - p->wl->display.shm, - &buffer_listener); - } - else { - shm_buffer_resize(p->osd_buffers[id], width, height); - } - - shm_buffer_t *buf = p->osd_buffers[id]; - SHM_BUFFER_SET_BUSY(buf); - - struct mp_image wlimg = buffer_get_mp_image(p, buf); - - for (int n = 0; n < imgs->num_parts; n++) { - struct sub_bitmap *sub = &imgs->parts[n]; - memcpy_pic(wlimg.planes[0], sub->bitmap, sub->w * 4, sub->h, - wlimg.stride[0], sub->stride); - } - - wl_subsurface_set_position(p->osd_subsurfaces[id], 0, 0); - wl_surface_attach(s, buf->buffer, bb.x0, bb.y0); - wl_surface_damage(s, 0, 0, width, height); - wl_surface_commit(s); - } - else { - // p->osd_buffer, guaranteed to exist here - assert(p->osd_buffers[id]); - wl_surface_attach(s, p->osd_buffers[id]->buffer, 0, 0); - wl_surface_commit(s); - } -} - -static const bool osd_formats[SUBBITMAP_COUNT] = { - [SUBBITMAP_RGBA] = true, -}; - -static void draw_osd(struct vo *vo) -{ - int32_t scale = 1; - struct priv *p = vo->priv; - - if (p->wl && p->wl->display.current_output) - scale = p->wl->display.current_output->scale; - - // detach all buffers and attach all needed buffers in osd_draw - // only the most recent attach & commit is applied once the parent surface - // is committed - for (int i = 0; i < MAX_OSD_PARTS; ++i) { - 
struct wl_surface *s = p->osd_surfaces[i]; - wl_surface_attach(s, NULL, 0, 0); - wl_surface_set_buffer_scale(s, scale); - wl_surface_damage(s, 0, 0, p->dst_w, p->dst_h); - wl_surface_commit(s); - } - - double pts = p->original_image ? p->original_image->pts : 0; - osd_draw(vo->osd, p->osd, pts, 0, osd_formats, draw_osd_cb, p); -} - -static void redraw(void *data, uint32_t time) -{ - struct priv *p = data; - - shm_buffer_t *buf = buffer_pool_get_front(&p->video_bufpool); - wl_surface_attach(p->wl->window.video_surface, buf->buffer, p->x, p->y); - wl_surface_damage(p->wl->window.video_surface, 0, 0, p->dst_w, p->dst_h); - buffer_finalise_front(buf); - - p->x = 0; - p->y = 0; -} - -static void flip_page(struct vo *vo) -{ - struct priv *p = vo->priv; - - buffer_pool_swap(&p->video_bufpool); - - if (!p->wl->frame.callback) - vo_wayland_request_frame(vo, p, redraw); - - vo_wayland_wait_events(vo, 0); -} - -static int query_format(struct vo *vo, int format) -{ - struct priv *p = vo->priv; - struct supported_format *sf; - wl_list_for_each_reverse(sf, &p->format_list, link) { - if (sf->format.mp_format == format) - return 1; - } - - if (mp_sws_supported_format(format)) - return 1; - - return 0; -} - -static int reconfig(struct vo *vo, struct mp_image_params *fmt) -{ - struct priv *p = vo->priv; - mp_image_unrefp(&p->original_image); - - p->width = fmt->w; - p->height = fmt->h; - p->in_format = *fmt; - - struct supported_format *sf; - - // find the matching format first - wl_list_for_each(sf, &p->format_list, link) { - if (sf->format.mp_format == fmt->imgfmt && - (p->enable_alpha == sf->is_alpha)) - { - p->video_format = &sf->format; - break; - } - } - - if (!p->video_format) { - // if use default is enable overwrite the auto selected one - if (p->enable_alpha) - p->video_format = &format_table[DEFAULT_ALPHA_FORMAT_ENTRY]; - else - p->video_format = &format_table[DEFAULT_FORMAT_ENTRY]; - } - - // overrides alpha - // use rgb565 if performance is your main concern - if 
(p->use_rgb565) { - MP_INFO(p->wl, "using rgb565\n"); - const format_t *entry = - is_wayland_format_supported(p, WL_SHM_FORMAT_RGB565); - if (entry) - p->video_format = entry; - } - - buffer_pool_reinit(p, &p->video_bufpool, 2, p->width, p->height, - *p->video_format, p->wl->display.shm); - - vo_wayland_config(vo); - - resize(p); - - return 0; -} - -static void uninit(struct vo *vo) -{ - struct priv *p = vo->priv; - buffer_pool_destroy(&p->video_bufpool); - - talloc_free(p->original_image); - - for (int i = 0; i < MAX_OSD_PARTS; ++i) { - shm_buffer_destroy(p->osd_buffers[i]); - wl_subsurface_destroy(p->osd_subsurfaces[i]); - wl_surface_destroy(p->osd_surfaces[i]); - } - - vo_wayland_uninit(vo); -} - -static int preinit(struct vo *vo) -{ - struct priv *p = vo->priv; - struct vo_wayland_state *wl = NULL; - - if (!vo_wayland_init(vo)) - return -1; - - wl = vo->wayland; - - p->vo = vo; - p->wl = wl; - p->sws = mp_sws_alloc(vo); - - wl_list_init(&p->format_list); - - wl_shm_add_listener(wl->display.shm, &shm_listener, p); - wl_display_dispatch(wl->display.display); - - // Commits on surfaces bound to a subsurface are cached until the parent - // surface is committed, in this case the video surface. - // Which means we can call commit anywhere. 
- struct wl_region *input = - wl_compositor_create_region(wl->display.compositor); - for (int i = 0; i < MAX_OSD_PARTS; ++i) { - p->osd_surfaces[i] = - wl_compositor_create_surface(wl->display.compositor); - wl_surface_attach(p->osd_surfaces[i], NULL, 0, 0); - wl_surface_set_input_region(p->osd_surfaces[i], input); - p->osd_subsurfaces[i] = - wl_subcompositor_get_subsurface(wl->display.subcomp, - p->osd_surfaces[i], - wl->window.video_surface); // parent - wl_surface_commit(p->osd_surfaces[i]); - wl_subsurface_set_sync(p->osd_subsurfaces[i]); - } - wl_region_destroy(input); - - return 0; -} - -static int control(struct vo *vo, uint32_t request, void *data) -{ - struct priv *p = vo->priv; - switch (request) { - case VOCTRL_SET_PANSCAN: { - resize(p); - return VO_TRUE; - } - case VOCTRL_REDRAW_FRAME: - return redraw_frame(p); - } - int events = 0; - int r = vo_wayland_control(vo, &events, request, data); - - // NOTE: VO_EVENT_EXPOSE is never returned by the wayland backend - if (events & VO_EVENT_RESIZE) - resize(p); - - vo_event(vo, events); - - return r; -} - -#define OPT_BASE_STRUCT struct priv -const struct vo_driver video_out_wayland = { - .description = "Wayland SHM video output", - .name = "wayland", - .priv_size = sizeof(struct priv), - .preinit = preinit, - .query_format = query_format, - .reconfig = reconfig, - .control = control, - .draw_image = draw_image, - .flip_page = flip_page, - .wakeup = vo_wayland_wakeup, - .wait_events = vo_wayland_wait_events, - .uninit = uninit, - .options = (const struct m_option[]) { - OPT_FLAG("alpha", enable_alpha, 0), - OPT_FLAG("rgb565", use_rgb565, 0), - {0} - }, - .options_prefix = "vo-wayland", -}; - diff --git a/video/out/vo_x11.c b/video/out/vo_x11.c index dd2d942..f29d06a 100644 --- a/video/out/vo_x11.c +++ b/video/out/vo_x11.c @@ -37,11 +37,9 @@ #include "x11_common.h" -#if HAVE_SHM #include <sys/ipc.h> #include <sys/shm.h> #include <X11/extensions/XShm.h> -#endif #include "sub/osd.h" #include "sub/draw_bmp.h" @@ 
-79,11 +77,9 @@ struct priv { int current_buf; bool reset_view; -#if HAVE_SHM int Shmem_Flag; XShmSegmentInfo Shminfo[2]; int Shm_Warned_Slow; -#endif }; static bool resize(struct vo *vo); @@ -91,7 +87,6 @@ static bool resize(struct vo *vo); static bool getMyXImage(struct priv *p, int foo) { struct vo *vo = p->vo; -#if HAVE_SHM if (vo->x11->display_is_local && XShmQueryExtension(vo->x11->display)) { p->Shmem_Flag = 1; vo->x11->ShmCompletionEvent = XShmGetEventBase(vo->x11->display) @@ -136,34 +131,29 @@ static bool getMyXImage(struct priv *p, int foo) } else { shmemerror: p->Shmem_Flag = 0; -#endif - MP_VERBOSE(vo, "Not using SHM.\n"); - p->myximage[foo] = - XCreateImage(vo->x11->display, p->vinfo.visual, p->depth, ZPixmap, - 0, NULL, p->image_width, p->image_height, 8, 0); - if (!p->myximage[foo]) { - MP_WARN(vo, "could not allocate image"); - return false; + + MP_VERBOSE(vo, "Not using SHM.\n"); + p->myximage[foo] = + XCreateImage(vo->x11->display, p->vinfo.visual, p->depth, ZPixmap, + 0, NULL, p->image_width, p->image_height, 8, 0); + if (!p->myximage[foo]) { + MP_WARN(vo, "could not allocate image"); + return false; + } + p->myximage[foo]->data = + calloc(1, p->myximage[foo]->bytes_per_line * p->image_height + 32); } - p->myximage[foo]->data = - calloc(1, p->myximage[foo]->bytes_per_line * p->image_height + 32); -#if HAVE_SHM -} -#endif return true; } static void freeMyXImage(struct priv *p, int foo) { -#if HAVE_SHM struct vo *vo = p->vo; if (p->Shmem_Flag) { XShmDetach(vo->x11->display, &p->Shminfo[foo]); XDestroyImage(p->myximage[foo]); shmdt(p->Shminfo[foo].shmaddr); - } else -#endif - { + } else { if (p->myximage[foo]) XDestroyImage(p->myximage[foo]); } @@ -284,15 +274,12 @@ static void Display_Image(struct priv *p, XImage *myximage) p->reset_view = false; } -#if HAVE_SHM if (p->Shmem_Flag) { XShmPutImage(vo->x11->display, vo->x11->window, p->gc, x_image, 0, 0, p->dst.x0, p->dst.y0, p->dst_w, p->dst_h, True); vo->x11->ShmCompletionWaitCount++; - } else 
-#endif - { + } else { XPutImage(vo->x11->display, vo->x11->window, p->gc, x_image, 0, 0, p->dst.x0, p->dst.y0, p->dst_w, p->dst_h); } @@ -312,7 +299,6 @@ static struct mp_image get_x_buffer(struct priv *p, int buf_index) static void wait_for_completion(struct vo *vo, int max_outstanding) { -#if HAVE_SHM struct priv *ctx = vo->priv; struct vo_x11_state *x11 = vo->x11; if (ctx->Shmem_Flag) { @@ -326,7 +312,6 @@ static void wait_for_completion(struct vo *vo, int max_outstanding) vo_x11_check_events(vo); } } -#endif } static void flip_page(struct vo *vo) diff --git a/video/out/vo_xv.c b/video/out/vo_xv.c index 7c710f2..e75a653 100644 --- a/video/out/vo_xv.c +++ b/video/out/vo_xv.c @@ -30,12 +30,10 @@ #include "config.h" -#if HAVE_SHM #include <sys/types.h> #include <sys/ipc.h> #include <sys/shm.h> #include <X11/extensions/XShm.h> -#endif // Note: depends on the inclusion of X11/extensions/XShm.h #include <X11/extensions/Xv.h> @@ -92,10 +90,8 @@ struct xvctx { GC f_gc; // used to paint background GC vo_gc; // used to paint video int Shmem_Flag; -#if HAVE_SHM XShmSegmentInfo Shminfo[MAX_BUFFERS]; int Shm_Warned_Slow; -#endif }; #define MP_FOURCC(a,b,c,d) ((a) | ((b)<<8) | ((c)<<16) | ((unsigned)(d)<<24)) @@ -542,7 +538,6 @@ static bool allocate_xvimage(struct vo *vo, int foo) int aligned_w = FFALIGN(ctx->image_width, 32); // round up the height to next chroma boundary too int aligned_h = FFALIGN(ctx->image_height, 2); -#if HAVE_SHM if (x11->display_is_local && XShmQueryExtension(x11->display)) { ctx->Shmem_Flag = 1; x11->ShmCompletionEvent = XShmGetEventBase(x11->display) @@ -572,9 +567,7 @@ static bool allocate_xvimage(struct vo *vo, int foo) XShmAttach(x11->display, &ctx->Shminfo[foo]); XSync(x11->display, False); shmctl(ctx->Shminfo[foo].shmid, IPC_RMID, 0); - } else -#endif - { + } else { ctx->xvimage[foo] = (XvImage *) XvCreateImage(x11->display, ctx->xv_port, ctx->xv_format, NULL, aligned_w, @@ -604,22 +597,17 @@ static bool allocate_xvimage(struct vo *vo, int 
foo) static void deallocate_xvimage(struct vo *vo, int foo) { struct xvctx *ctx = vo->priv; -#if HAVE_SHM if (ctx->Shmem_Flag) { XShmDetach(vo->x11->display, &ctx->Shminfo[foo]); shmdt(ctx->Shminfo[foo].shmaddr); - } else -#endif - { + } else { av_free(ctx->xvimage[foo]->data); } if (ctx->xvimage[foo]) XFree(ctx->xvimage[foo]); ctx->xvimage[foo] = NULL; -#if HAVE_SHM ctx->Shminfo[foo] = (XShmSegmentInfo){0}; -#endif XSync(vo->x11->display, False); return; @@ -633,16 +621,14 @@ static inline void put_xvimage(struct vo *vo, XvImage *xvi) struct mp_rect *dst = &ctx->dst_rect; int dw = dst->x1 - dst->x0, dh = dst->y1 - dst->y0; int sw = src->x1 - src->x0, sh = src->y1 - src->y0; -#if HAVE_SHM + if (ctx->Shmem_Flag) { XvShmPutImage(x11->display, ctx->xv_port, x11->window, ctx->vo_gc, xvi, src->x0, src->y0, sw, sh, dst->x0, dst->y0, dw, dh, True); x11->ShmCompletionWaitCount++; - } else -#endif - { + } else { XvPutImage(x11->display, ctx->xv_port, x11->window, ctx->vo_gc, xvi, src->x0, src->y0, sw, sh, dst->x0, dst->y0, dw, dh); @@ -677,7 +663,6 @@ static struct mp_image get_xv_buffer(struct vo *vo, int buf_index) static void wait_for_completion(struct vo *vo, int max_outstanding) { -#if HAVE_SHM struct xvctx *ctx = vo->priv; struct vo_x11_state *x11 = vo->x11; if (ctx->Shmem_Flag) { @@ -691,7 +676,6 @@ static void wait_for_completion(struct vo *vo, int max_outstanding) vo_x11_check_events(vo); } } -#endif } static void flip_page(struct vo *vo) diff --git a/video/out/vulkan/common.h b/video/out/vulkan/common.h new file mode 100644 index 0000000..6e82bfa --- /dev/null +++ b/video/out/vulkan/common.h @@ -0,0 +1,58 @@ +#pragma once + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <assert.h> + +#include "config.h" + +#include "common/common.h" +#include "common/msg.h" + +// We need to define all platforms we want to support. 
Since we have +// our own mechanism for checking this, we re-define the right symbols +#if HAVE_WAYLAND +#define VK_USE_PLATFORM_WAYLAND_KHR +#endif +#if HAVE_X11 +#define VK_USE_PLATFORM_XLIB_KHR +#endif +#if HAVE_WIN32_DESKTOP +#define VK_USE_PLATFORM_WIN32_KHR +#endif + +#include <vulkan/vulkan.h> + +// Vulkan allows the optional use of a custom allocator. We don't need one but +// mark this parameter with a better name in case we ever decide to change this +// in the future. (And to make the code more readable) +#define MPVK_ALLOCATOR NULL + +// A lot of things depend on streaming resources across frames. Depending on +// how many frames we render ahead of time, we need to pick enough to avoid +// any conflicts, so make all of these tunable relative to this constant in +// order to centralize them. +#define MPVK_MAX_STREAMING_DEPTH 8 + +// Shared struct used to hold vulkan context information +struct mpvk_ctx { + struct mp_log *log; + VkInstance inst; + VkPhysicalDevice physd; + VkDebugReportCallbackEXT dbg; + VkDevice dev; + + // Surface, must be initialized fter the context itself + VkSurfaceKHR surf; + VkSurfaceFormatKHR surf_format; // picked at surface initialization time + + struct vk_malloc *alloc; // memory allocator for this device + struct vk_cmdpool *pool; // primary command pool for this device + struct vk_cmd *last_cmd; // most recently submitted command + struct spirv_compiler *spirv; // GLSL -> SPIR-V compiler + + // Cached capabilities + VkPhysicalDeviceLimits limits; +}; diff --git a/video/out/vulkan/context.c b/video/out/vulkan/context.c new file mode 100644 index 0000000..0bca198 --- /dev/null +++ b/video/out/vulkan/context.c @@ -0,0 +1,518 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "options/m_config.h" +#include "video/out/gpu/spirv.h" + +#include "context.h" +#include "ra_vk.h" +#include "utils.h" + +enum { + SWAP_AUTO = 0, + SWAP_FIFO, + SWAP_FIFO_RELAXED, + SWAP_MAILBOX, + SWAP_IMMEDIATE, + SWAP_COUNT, +}; + +struct vulkan_opts { + struct mpvk_device_opts dev_opts; // logical device options + char *device; // force a specific GPU + int swap_mode; +}; + +static int vk_validate_dev(struct mp_log *log, const struct m_option *opt, + struct bstr name, struct bstr param) +{ + int ret = M_OPT_INVALID; + VkResult res; + + // Create a dummy instance to validate/list the devices + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + VkInstance inst; + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + + res = vkCreateInstance(&info, MPVK_ALLOCATOR, &inst); + if (res != VK_SUCCESS) + goto error; + + res = vkEnumeratePhysicalDevices(inst, &num, NULL); + if (res != VK_SUCCESS) + goto error; + + devices = talloc_array(NULL, VkPhysicalDevice, num); + vkEnumeratePhysicalDevices(inst, &num, devices); + if (res != VK_SUCCESS) + goto error; + + bool help = bstr_equals0(param, "help"); + if (help) { + mp_info(log, "Available vulkan devices:\n"); + ret = M_OPT_EXIT; + } + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop; + vkGetPhysicalDeviceProperties(devices[i], &prop); + + if (help) { + mp_info(log, " '%s' (GPU %d, ID %x:%x)\n", prop.deviceName, i, + (unsigned)prop.vendorID, (unsigned)prop.deviceID); + } else if (bstr_equals0(param, prop.deviceName)) { + ret = 0; + break; + } + } + + if 
(!help) + mp_err(log, "No device with name '%.*s'!\n", BSTR_P(param)); + +error: + talloc_free(devices); + return ret; +} + +#define OPT_BASE_STRUCT struct vulkan_opts +const struct m_sub_options vulkan_conf = { + .opts = (const struct m_option[]) { + OPT_STRING_VALIDATE("vulkan-device", device, 0, vk_validate_dev), + OPT_CHOICE("vulkan-swap-mode", swap_mode, 0, + ({"auto", SWAP_AUTO}, + {"fifo", SWAP_FIFO}, + {"fifo-relaxed", SWAP_FIFO_RELAXED}, + {"mailbox", SWAP_MAILBOX}, + {"immediate", SWAP_IMMEDIATE})), + OPT_INTRANGE("vulkan-queue-count", dev_opts.queue_count, 0, 1, + MPVK_MAX_QUEUES, OPTDEF_INT(1)), + {0} + }, + .size = sizeof(struct vulkan_opts) +}; + +struct priv { + struct mpvk_ctx *vk; + struct vulkan_opts *opts; + // Swapchain metadata: + int w, h; // current size + VkSwapchainCreateInfoKHR protoInfo; // partially filled-in prototype + VkSwapchainKHR swapchain; + VkSwapchainKHR old_swapchain; + int frames_in_flight; + // state of the images: + struct ra_tex **images; // ra_tex wrappers for the vkimages + int num_images; // size of images + VkSemaphore *acquired; // pool of semaphores used to synchronize images + int num_acquired; // size of this pool + int idx_acquired; // index of next free semaphore within this pool + int last_imgidx; // the image index last acquired (for submit) +}; + +static const struct ra_swapchain_fns vulkan_swapchain; + +struct mpvk_ctx *ra_vk_ctx_get(struct ra_ctx *ctx) +{ + if (ctx->swapchain->fns != &vulkan_swapchain) + return NULL; + + struct priv *p = ctx->swapchain->priv; + return p->vk; +} + +static bool update_swapchain_info(struct priv *p, + VkSwapchainCreateInfoKHR *info) +{ + struct mpvk_ctx *vk = p->vk; + + // Query the supported capabilities and update this struct as needed + VkSurfaceCapabilitiesKHR caps; + VK(vkGetPhysicalDeviceSurfaceCapabilitiesKHR(vk->physd, vk->surf, &caps)); + + // Sorted by preference + static const VkCompositeAlphaFlagsKHR alphaModes[] = { + VK_COMPOSITE_ALPHA_POST_MULTIPLIED_BIT_KHR, + 
VK_COMPOSITE_ALPHA_OPAQUE_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(alphaModes); i++) { + if (caps.supportedCompositeAlpha & alphaModes[i]) { + info->compositeAlpha = alphaModes[i]; + break; + } + } + + if (!info->compositeAlpha) { + MP_ERR(vk, "Failed picking alpha compositing mode (caps: 0x%x)\n", + caps.supportedCompositeAlpha); + goto error; + } + + static const VkSurfaceTransformFlagsKHR rotModes[] = { + VK_SURFACE_TRANSFORM_IDENTITY_BIT_KHR, + VK_SURFACE_TRANSFORM_INHERIT_BIT_KHR, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(rotModes); i++) { + if (caps.supportedTransforms & rotModes[i]) { + info->preTransform = rotModes[i]; + break; + } + } + + if (!info->preTransform) { + MP_ERR(vk, "Failed picking surface transform mode (caps: 0x%x)\n", + caps.supportedTransforms); + goto error; + } + + // Image count as required + MP_VERBOSE(vk, "Requested image count: %d (min %d max %d)\n", + (int)info->minImageCount, (int)caps.minImageCount, + (int)caps.maxImageCount); + + info->minImageCount = MPMAX(info->minImageCount, caps.minImageCount); + if (caps.maxImageCount) + info->minImageCount = MPMIN(info->minImageCount, caps.maxImageCount); + + // Check the extent against the allowed parameters + if (caps.currentExtent.width != info->imageExtent.width && + caps.currentExtent.width != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested width %d does not match current width %d\n", + (int)info->imageExtent.width, (int)caps.currentExtent.width); + info->imageExtent.width = caps.currentExtent.width; + } + + if (caps.currentExtent.height != info->imageExtent.height && + caps.currentExtent.height != 0xFFFFFFFF) + { + MP_WARN(vk, "Requested height %d does not match current height %d\n", + (int)info->imageExtent.height, (int)caps.currentExtent.height); + info->imageExtent.height = caps.currentExtent.height; + } + + if (caps.minImageExtent.width > info->imageExtent.width || + caps.minImageExtent.height > info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d smaller than 
device minimum %d%d\n", + (int)info->imageExtent.width, (int)info->imageExtent.height, + (int)caps.minImageExtent.width, (int)caps.minImageExtent.height); + goto error; + } + + if (caps.maxImageExtent.width < info->imageExtent.width || + caps.maxImageExtent.height < info->imageExtent.height) + { + MP_ERR(vk, "Requested size %dx%d larger than device maximum %d%d\n", + (int)info->imageExtent.width, (int)info->imageExtent.height, + (int)caps.maxImageExtent.width, (int)caps.maxImageExtent.height); + goto error; + } + + // We just request whatever usage we can, and let the ra_vk decide what + // ra_tex_params that translates to. This makes the images as flexible + // as possible. + info->imageUsage = caps.supportedUsageFlags; + return true; + +error: + return false; +} + +void ra_vk_ctx_uninit(struct ra_ctx *ctx) +{ + if (ctx->ra) { + struct priv *p = ctx->swapchain->priv; + struct mpvk_ctx *vk = p->vk; + + mpvk_pool_wait_idle(vk, vk->pool); + + for (int i = 0; i < p->num_images; i++) + ra_tex_free(ctx->ra, &p->images[i]); + for (int i = 0; i < p->num_acquired; i++) + vkDestroySemaphore(vk->dev, p->acquired[i], MPVK_ALLOCATOR); + + vkDestroySwapchainKHR(vk->dev, p->swapchain, MPVK_ALLOCATOR); + + talloc_free(p->images); + talloc_free(p->acquired); + ctx->ra->fns->destroy(ctx->ra); + ctx->ra = NULL; + } + + talloc_free(ctx->swapchain); + ctx->swapchain = NULL; +} + +static const struct ra_swapchain_fns vulkan_swapchain; + +bool ra_vk_ctx_init(struct ra_ctx *ctx, struct mpvk_ctx *vk, + VkPresentModeKHR preferred_mode) +{ + struct ra_swapchain *sw = ctx->swapchain = talloc_zero(NULL, struct ra_swapchain); + sw->ctx = ctx; + sw->fns = &vulkan_swapchain; + + struct priv *p = sw->priv = talloc_zero(sw, struct priv); + p->vk = vk; + p->opts = mp_get_config_group(p, ctx->global, &vulkan_conf); + + if (!mpvk_find_phys_device(vk, p->opts->device, ctx->opts.allow_sw)) + goto error; + if (!spirv_compiler_init(ctx)) + goto error; + vk->spirv = ctx->spirv; + if 
(!mpvk_pick_surface_format(vk)) + goto error; + if (!mpvk_device_init(vk, p->opts->dev_opts)) + goto error; + + ctx->ra = ra_create_vk(vk, ctx->log); + if (!ctx->ra) + goto error; + + static const VkPresentModeKHR present_modes[SWAP_COUNT] = { + [SWAP_FIFO] = VK_PRESENT_MODE_FIFO_KHR, + [SWAP_FIFO_RELAXED] = VK_PRESENT_MODE_FIFO_RELAXED_KHR, + [SWAP_MAILBOX] = VK_PRESENT_MODE_MAILBOX_KHR, + [SWAP_IMMEDIATE] = VK_PRESENT_MODE_IMMEDIATE_KHR, + }; + + p->protoInfo = (VkSwapchainCreateInfoKHR) { + .sType = VK_STRUCTURE_TYPE_SWAPCHAIN_CREATE_INFO_KHR, + .surface = vk->surf, + .imageFormat = vk->surf_format.format, + .imageColorSpace = vk->surf_format.colorSpace, + .imageArrayLayers = 1, // non-stereoscopic + .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, + .minImageCount = ctx->opts.swapchain_depth + 1, // +1 for FB + .presentMode = p->opts->swap_mode ? present_modes[p->opts->swap_mode] + : preferred_mode, + .clipped = true, + }; + + // Make sure the swapchain present mode is supported + int num_modes; + VK(vkGetPhysicalDeviceSurfacePresentModesKHR(vk->physd, vk->surf, + &num_modes, NULL)); + VkPresentModeKHR *modes = talloc_array(NULL, VkPresentModeKHR, num_modes); + VK(vkGetPhysicalDeviceSurfacePresentModesKHR(vk->physd, vk->surf, + &num_modes, modes)); + bool supported = false; + for (int i = 0; i < num_modes; i++) + supported |= (modes[i] == p->protoInfo.presentMode); + talloc_free(modes); + + if (!supported) { + MP_ERR(ctx, "Requested swap mode unsupported by this device!\n"); + goto error; + } + + return true; + +error: + ra_vk_ctx_uninit(ctx); + return false; +} + +static void destroy_swapchain(struct mpvk_ctx *vk, struct priv *p) +{ + assert(p->old_swapchain); + vkDestroySwapchainKHR(vk->dev, p->old_swapchain, MPVK_ALLOCATOR); + p->old_swapchain = NULL; +} + +bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h) +{ + struct priv *p = sw->priv; + if (w == p->w && h == p->h) + return true; + + struct ra *ra = sw->ctx->ra; + struct mpvk_ctx *vk = p->vk; + 
VkImage *vkimages = NULL; + + // It's invalid to trigger another swapchain recreation while there's + // more than one swapchain already active, so we need to flush any pending + // asynchronous swapchain release operations that may be ongoing. + while (p->old_swapchain) + mpvk_dev_poll_cmds(vk, 100000); // 100μs + + VkSwapchainCreateInfoKHR sinfo = p->protoInfo; + sinfo.imageExtent = (VkExtent2D){ w, h }; + sinfo.oldSwapchain = p->swapchain; + + if (!update_swapchain_info(p, &sinfo)) + goto error; + + VK(vkCreateSwapchainKHR(vk->dev, &sinfo, MPVK_ALLOCATOR, &p->swapchain)); + p->w = w; + p->h = h; + + // Freeing the old swapchain while it's still in use is an error, so do + // it asynchronously once the device is idle. + if (sinfo.oldSwapchain) { + p->old_swapchain = sinfo.oldSwapchain; + vk_dev_callback(vk, (vk_cb) destroy_swapchain, vk, p); + } + + // Get the new swapchain images + int num; + VK(vkGetSwapchainImagesKHR(vk->dev, p->swapchain, &num, NULL)); + vkimages = talloc_array(NULL, VkImage, num); + VK(vkGetSwapchainImagesKHR(vk->dev, p->swapchain, &num, vkimages)); + + // If needed, allocate some more semaphores + while (num > p->num_acquired) { + VkSemaphore sem; + static const VkSemaphoreCreateInfo seminfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + VK(vkCreateSemaphore(vk->dev, &seminfo, MPVK_ALLOCATOR, &sem)); + MP_TARRAY_APPEND(NULL, p->acquired, p->num_acquired, sem); + } + + // Recreate the ra_tex wrappers + for (int i = 0; i < p->num_images; i++) + ra_tex_free(ra, &p->images[i]); + + p->num_images = num; + MP_TARRAY_GROW(NULL, p->images, p->num_images); + for (int i = 0; i < num; i++) { + p->images[i] = ra_vk_wrap_swapchain_img(ra, vkimages[i], sinfo); + if (!p->images[i]) + goto error; + } + + talloc_free(vkimages); + return true; + +error: + talloc_free(vkimages); + vkDestroySwapchainKHR(vk->dev, p->swapchain, MPVK_ALLOCATOR); + p->swapchain = NULL; + return false; +} + +static int color_depth(struct ra_swapchain *sw) +{ + 
struct priv *p = sw->priv; + int bits = 0; + + if (!p->num_images) + return bits; + + // The channel with the most bits is probably the most authoritative about + // the actual color information (consider e.g. a2bgr10). Slight downside + // in that it results in rounding r/b for e.g. rgb565, but we don't pick + // surfaces with fewer than 8 bits anyway. + const struct ra_format *fmt = p->images[0]->params.format; + for (int i = 0; i < fmt->num_components; i++) { + int depth = fmt->component_depth[i]; + bits = MPMAX(bits, depth ? depth : fmt->component_size[i]); + } + + return bits; +} + +static bool start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo) +{ + struct priv *p = sw->priv; + struct mpvk_ctx *vk = p->vk; + if (!p->swapchain) + goto error; + + uint32_t imgidx = 0; + MP_TRACE(vk, "vkAcquireNextImageKHR\n"); + VkResult res = vkAcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX, + p->acquired[p->idx_acquired], NULL, + &imgidx); + if (res == VK_ERROR_OUT_OF_DATE_KHR) + goto error; // just return in this case + VK_ASSERT(res, "Failed acquiring swapchain image"); + + p->last_imgidx = imgidx; + *out_fbo = (struct ra_fbo) { + .tex = p->images[imgidx], + .flip = false, + }; + return true; + +error: + return false; +} + +static bool submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame) +{ + struct priv *p = sw->priv; + struct ra *ra = sw->ctx->ra; + struct mpvk_ctx *vk = p->vk; + if (!p->swapchain) + goto error; + + VkSemaphore acquired = p->acquired[p->idx_acquired++]; + p->idx_acquired %= p->num_acquired; + + VkSemaphore done; + if (!ra_vk_submit(ra, p->images[p->last_imgidx], acquired, &done, + &p->frames_in_flight)) + goto error; + + // Older nvidia drivers can spontaneously combust when submitting to the + // same queue as we're rendering from, in a multi-queue scenario. Safest + // option is to cycle the queues first and then submit to the next queue. + // We can drop this hack in the future, I suppose. 
+ vk_cmd_cycle_queues(vk); + struct vk_cmdpool *pool = vk->pool; + VkQueue queue = pool->queues[pool->qindex]; + + VkPresentInfoKHR pinfo = { + .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, + .waitSemaphoreCount = 1, + .pWaitSemaphores = &done, + .swapchainCount = 1, + .pSwapchains = &p->swapchain, + .pImageIndices = &p->last_imgidx, + }; + + VK(vkQueuePresentKHR(queue, &pinfo)); + return true; + +error: + return false; +} + +static void swap_buffers(struct ra_swapchain *sw) +{ + struct priv *p = sw->priv; + + while (p->frames_in_flight >= sw->ctx->opts.swapchain_depth) + mpvk_dev_poll_cmds(p->vk, 100000); // 100μs +} + +static const struct ra_swapchain_fns vulkan_swapchain = { + // .screenshot is not currently supported + .color_depth = color_depth, + .start_frame = start_frame, + .submit_frame = submit_frame, + .swap_buffers = swap_buffers, +}; diff --git a/video/out/vulkan/context.h b/video/out/vulkan/context.h new file mode 100644 index 0000000..a64d39f --- /dev/null +++ b/video/out/vulkan/context.h @@ -0,0 +1,13 @@ +#pragma once + +#include "video/out/gpu/context.h" +#include "common.h" + +// Helpers for ra_ctx based on ra_vk. These initialize ctx->ra and ctx->swchain. +void ra_vk_ctx_uninit(struct ra_ctx *ctx); +bool ra_vk_ctx_init(struct ra_ctx *ctx, struct mpvk_ctx *vk, + VkPresentModeKHR preferred_mode); +bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h); + +// May be called on a ra_ctx of any type. +struct mpvk_ctx *ra_vk_ctx_get(struct ra_ctx *ctx); diff --git a/video/out/vulkan/context_wayland.c b/video/out/vulkan/context_wayland.c new file mode 100644 index 0000000..7276775 --- /dev/null +++ b/video/out/vulkan/context_wayland.c @@ -0,0 +1,133 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "video/out/gpu/context.h" +#include "video/out/wayland_common.h" + +#include "common.h" +#include "context.h" +#include "utils.h" + +struct priv { + struct mpvk_ctx vk; +}; + +static void wayland_vk_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + + ra_vk_ctx_uninit(ctx); + mpvk_uninit(&p->vk); + vo_wayland_uninit(ctx->vo); +} + +static bool wayland_vk_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + struct mpvk_ctx *vk = &p->vk; + int msgl = ctx->opts.probing ? MSGL_V : MSGL_ERR; + + if (!mpvk_instance_init(vk, ctx->log, VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME, + ctx->opts.debug)) + goto error; + + if (!vo_wayland_init(ctx->vo)) + goto error; + + VkWaylandSurfaceCreateInfoKHR wlinfo = { + .sType = VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR, + .display = ctx->vo->wl->display, + .surface = ctx->vo->wl->surface, + }; + + VkResult res = vkCreateWaylandSurfaceKHR(vk->inst, &wlinfo, MPVK_ALLOCATOR, + &vk->surf); + if (res != VK_SUCCESS) { + MP_MSG(ctx, msgl, "Failed creating Wayland surface: %s\n", vk_err(res)); + goto error; + } + + /* Because in Wayland clients render whenever they receive a callback from + * the compositor, and the fact that the compositor usually stops sending + * callbacks once the surface is no longer visible, using FIFO here would + * mean the entire player would block on acquiring swapchain images. 
Hence,
+     * use MAILBOX to guarantee that there'll always be a swapchain image and
+     * the player won't block waiting on those */
+    if (!ra_vk_ctx_init(ctx, vk, VK_PRESENT_MODE_MAILBOX_KHR))
+        goto error;
+
+    return true;
+
+error:
+    wayland_vk_uninit(ctx);
+    return false;
+}
+
+static void resize(struct ra_ctx *ctx)
+{
+    struct vo_wayland_state *wl = ctx->vo->wl;
+
+    MP_VERBOSE(wl, "Handling resize on the vk side\n");
+
+    const int32_t width = wl->scaling*mp_rect_w(wl->geometry);
+    const int32_t height = wl->scaling*mp_rect_h(wl->geometry);
+
+    wl_surface_set_buffer_scale(wl->surface, wl->scaling);
+
+    wl->vo->dwidth = width;
+    wl->vo->dheight = height;
+}
+
+static bool wayland_vk_reconfig(struct ra_ctx *ctx)
+{
+    if (!vo_wayland_reconfig(ctx->vo))
+        return false;
+
+    return true;
+}
+
+static int wayland_vk_control(struct ra_ctx *ctx, int *events, int request, void *arg)
+{
+    int ret = vo_wayland_control(ctx->vo, events, request, arg);
+    if (*events & VO_EVENT_RESIZE) {
+        resize(ctx);
+        if (!ra_vk_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight))
+            return VO_ERROR;
+    }
+    return ret;
+}
+
+static void wayland_vk_wakeup(struct ra_ctx *ctx)
+{
+    vo_wayland_wakeup(ctx->vo);
+}
+
+static void wayland_vk_wait_events(struct ra_ctx *ctx, int64_t until_time_us)
+{
+    vo_wayland_wait_events(ctx->vo, until_time_us);
+}
+
+const struct ra_ctx_fns ra_ctx_vulkan_wayland = {
+    .type = "vulkan",
+    .name = "waylandvk",
+    .reconfig = wayland_vk_reconfig,
+    .control = wayland_vk_control,
+    .wakeup = wayland_vk_wakeup,
+    .wait_events = wayland_vk_wait_events,
+    .init = wayland_vk_init,
+    .uninit = wayland_vk_uninit,
+};
diff --git a/video/out/vulkan/context_win.c b/video/out/vulkan/context_win.c
new file mode 100644
index 0000000..cf31586
--- /dev/null
+++ b/video/out/vulkan/context_win.c
@@ -0,0 +1,105 @@
+/*
+ * This file is part of mpv.
+ * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "video/out/gpu/context.h" +#include "video/out/w32_common.h" + +#include "common.h" +#include "context.h" +#include "utils.h" + +EXTERN_C IMAGE_DOS_HEADER __ImageBase; +#define HINST_THISCOMPONENT ((HINSTANCE)&__ImageBase) + +struct priv { + struct mpvk_ctx vk; +}; + +static void win_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + + ra_vk_ctx_uninit(ctx); + mpvk_uninit(&p->vk); + vo_w32_uninit(ctx->vo); +} + +static bool win_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + struct mpvk_ctx *vk = &p->vk; + int msgl = ctx->opts.probing ? 
MSGL_V : MSGL_ERR; + + if (!mpvk_instance_init(vk, ctx->log, VK_KHR_WIN32_SURFACE_EXTENSION_NAME, + ctx->opts.debug)) + goto error; + + if (!vo_w32_init(ctx->vo)) + goto error; + + VkWin32SurfaceCreateInfoKHR wininfo = { + .sType = VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR, + .hinstance = HINST_THISCOMPONENT, + .hwnd = vo_w32_hwnd(ctx->vo), + }; + + VkResult res = vkCreateWin32SurfaceKHR(vk->inst, &wininfo, MPVK_ALLOCATOR, + &vk->surf); + if (res != VK_SUCCESS) { + MP_MSG(ctx, msgl, "Failed creating Windows surface: %s\n", vk_err(res)); + goto error; + } + + if (!ra_vk_ctx_init(ctx, vk, VK_PRESENT_MODE_FIFO_KHR)) + goto error; + + return true; + +error: + win_uninit(ctx); + return false; +} + +static bool resize(struct ra_ctx *ctx) +{ + return ra_vk_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight); +} + +static bool win_reconfig(struct ra_ctx *ctx) +{ + vo_w32_config(ctx->vo); + return resize(ctx); +} + +static int win_control(struct ra_ctx *ctx, int *events, int request, void *arg) +{ + int ret = vo_w32_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) { + if (!resize(ctx)) + return VO_ERROR; + } + return ret; +} + +const struct ra_ctx_fns ra_ctx_vulkan_win = { + .type = "vulkan", + .name = "winvk", + .reconfig = win_reconfig, + .control = win_control, + .init = win_init, + .uninit = win_uninit, +}; diff --git a/video/out/vulkan/context_xlib.c b/video/out/vulkan/context_xlib.c new file mode 100644 index 0000000..c3bd49f --- /dev/null +++ b/video/out/vulkan/context_xlib.c @@ -0,0 +1,117 @@ +/* + * This file is part of mpv. + * + * mpv is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * mpv is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with mpv. If not, see <http://www.gnu.org/licenses/>. + */ + +#include "video/out/gpu/context.h" +#include "video/out/x11_common.h" + +#include "common.h" +#include "context.h" +#include "utils.h" + +struct priv { + struct mpvk_ctx vk; +}; + +static void xlib_uninit(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv; + + ra_vk_ctx_uninit(ctx); + mpvk_uninit(&p->vk); + vo_x11_uninit(ctx->vo); +} + +static bool xlib_init(struct ra_ctx *ctx) +{ + struct priv *p = ctx->priv = talloc_zero(ctx, struct priv); + struct mpvk_ctx *vk = &p->vk; + int msgl = ctx->opts.probing ? MSGL_V : MSGL_ERR; + + if (!mpvk_instance_init(vk, ctx->log, VK_KHR_XLIB_SURFACE_EXTENSION_NAME, + ctx->opts.debug)) + goto error; + + if (!vo_x11_init(ctx->vo)) + goto error; + + if (!vo_x11_create_vo_window(ctx->vo, NULL, "mpvk")) + goto error; + + VkXlibSurfaceCreateInfoKHR xinfo = { + .sType = VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, + .dpy = ctx->vo->x11->display, + .window = ctx->vo->x11->window, + }; + + VkResult res = vkCreateXlibSurfaceKHR(vk->inst, &xinfo, MPVK_ALLOCATOR, + &vk->surf); + if (res != VK_SUCCESS) { + MP_MSG(ctx, msgl, "Failed creating Xlib surface: %s\n", vk_err(res)); + goto error; + } + + if (!ra_vk_ctx_init(ctx, vk, VK_PRESENT_MODE_FIFO_KHR)) + goto error; + + return true; + +error: + xlib_uninit(ctx); + return false; +} + +static bool resize(struct ra_ctx *ctx) +{ + return ra_vk_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight); +} + +static bool xlib_reconfig(struct ra_ctx *ctx) +{ + vo_x11_config_vo_window(ctx->vo); + return resize(ctx); +} + +static int xlib_control(struct ra_ctx *ctx, int *events, int request, void 
*arg) +{ + int ret = vo_x11_control(ctx->vo, events, request, arg); + if (*events & VO_EVENT_RESIZE) { + if (!resize(ctx)) + return VO_ERROR; + } + return ret; +} + +static void xlib_wakeup(struct ra_ctx *ctx) +{ + vo_x11_wakeup(ctx->vo); +} + +static void xlib_wait_events(struct ra_ctx *ctx, int64_t until_time_us) +{ + vo_x11_wait_events(ctx->vo, until_time_us); +} + +const struct ra_ctx_fns ra_ctx_vulkan_xlib = { + .type = "vulkan", + .name = "x11vk", + .reconfig = xlib_reconfig, + .control = xlib_control, + .wakeup = xlib_wakeup, + .wait_events = xlib_wait_events, + .init = xlib_init, + .uninit = xlib_uninit, +}; diff --git a/video/out/vulkan/formats.c b/video/out/vulkan/formats.c new file mode 100644 index 0000000..b44bead --- /dev/null +++ b/video/out/vulkan/formats.c @@ -0,0 +1,55 @@ +#include "formats.h" + +const struct vk_format vk_formats[] = { + // Regular, byte-aligned integer formats + {"r8", VK_FORMAT_R8_UNORM, 1, 1, {8 }, RA_CTYPE_UNORM }, + {"rg8", VK_FORMAT_R8G8_UNORM, 2, 2, {8, 8 }, RA_CTYPE_UNORM }, + {"rgb8", VK_FORMAT_R8G8B8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM }, + {"rgba8", VK_FORMAT_R8G8B8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM }, + {"r16", VK_FORMAT_R16_UNORM, 1, 2, {16 }, RA_CTYPE_UNORM }, + {"rg16", VK_FORMAT_R16G16_UNORM, 2, 4, {16, 16 }, RA_CTYPE_UNORM }, + {"rgb16", VK_FORMAT_R16G16B16_UNORM, 3, 6, {16, 16, 16 }, RA_CTYPE_UNORM }, + {"rgba16", VK_FORMAT_R16G16B16A16_UNORM, 4, 8, {16, 16, 16, 16}, RA_CTYPE_UNORM }, + + // Special, integer-only formats + {"r32ui", VK_FORMAT_R32_UINT, 1, 4, {32 }, RA_CTYPE_UINT }, + {"rg32ui", VK_FORMAT_R32G32_UINT, 2, 8, {32, 32 }, RA_CTYPE_UINT }, + {"rgb32ui", VK_FORMAT_R32G32B32_UINT, 3, 12, {32, 32, 32 }, RA_CTYPE_UINT }, + {"rgba32ui", VK_FORMAT_R32G32B32A32_UINT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_UINT }, + {"r64ui", VK_FORMAT_R64_UINT, 1, 8, {64 }, RA_CTYPE_UINT }, + {"rg64ui", VK_FORMAT_R64G64_UINT, 2, 16, {64, 64 }, RA_CTYPE_UINT }, + {"rgb64ui", VK_FORMAT_R64G64B64_UINT, 3, 24, {64, 64, 
64 }, RA_CTYPE_UINT }, + {"rgba64ui", VK_FORMAT_R64G64B64A64_UINT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_UINT }, + + // Packed integer formats + {"rg4", VK_FORMAT_R4G4_UNORM_PACK8, 2, 1, {4, 4 }, RA_CTYPE_UNORM }, + {"rgba4", VK_FORMAT_R4G4B4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM }, + {"rgb565", VK_FORMAT_R5G6B5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM }, + {"rgb565a1", VK_FORMAT_R5G5B5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM }, + + // Float formats (native formats, hf = half float, df = double float) + {"r16hf", VK_FORMAT_R16_SFLOAT, 1, 2, {16 }, RA_CTYPE_FLOAT }, + {"rg16hf", VK_FORMAT_R16G16_SFLOAT, 2, 4, {16, 16 }, RA_CTYPE_FLOAT }, + {"rgb16hf", VK_FORMAT_R16G16B16_SFLOAT, 3, 6, {16, 16, 16 }, RA_CTYPE_FLOAT }, + {"rgba16hf", VK_FORMAT_R16G16B16A16_SFLOAT, 4, 8, {16, 16, 16, 16}, RA_CTYPE_FLOAT }, + {"r32f", VK_FORMAT_R32_SFLOAT, 1, 4, {32 }, RA_CTYPE_FLOAT }, + {"rg32f", VK_FORMAT_R32G32_SFLOAT, 2, 8, {32, 32 }, RA_CTYPE_FLOAT }, + {"rgb32f", VK_FORMAT_R32G32B32_SFLOAT, 3, 12, {32, 32, 32 }, RA_CTYPE_FLOAT }, + {"rgba32f", VK_FORMAT_R32G32B32A32_SFLOAT, 4, 16, {32, 32, 32, 32}, RA_CTYPE_FLOAT }, + {"r64df", VK_FORMAT_R64_SFLOAT, 1, 8, {64 }, RA_CTYPE_FLOAT }, + {"rg64df", VK_FORMAT_R64G64_SFLOAT, 2, 16, {64, 64 }, RA_CTYPE_FLOAT }, + {"rgb64df", VK_FORMAT_R64G64B64_SFLOAT, 3, 24, {64, 64, 64 }, RA_CTYPE_FLOAT }, + {"rgba64df", VK_FORMAT_R64G64B64A64_SFLOAT, 4, 32, {64, 64, 64, 64}, RA_CTYPE_FLOAT }, + + // "Swapped" component order images + {"bgr8", VK_FORMAT_B8G8R8_UNORM, 3, 3, {8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra8", VK_FORMAT_B8G8R8A8_UNORM, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {"bgra4", VK_FORMAT_B4G4R4A4_UNORM_PACK16, 4, 2, {4, 4, 4, 4 }, RA_CTYPE_UNORM, true }, + {"bgr565", VK_FORMAT_B5G6R5_UNORM_PACK16, 3, 2, {5, 6, 5 }, RA_CTYPE_UNORM, true }, + {"bgr565a1", VK_FORMAT_B5G5R5A1_UNORM_PACK16, 4, 2, {5, 5, 5, 1 }, RA_CTYPE_UNORM, true }, + {"a1rgb5", VK_FORMAT_A1R5G5B5_UNORM_PACK16, 4, 2, {1, 5, 5, 5 }, 
RA_CTYPE_UNORM, true }, + {"a2rgb10", VK_FORMAT_A2R10G10B10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"a2bgr10", VK_FORMAT_A2B10G10R10_UNORM_PACK32, 4, 4, {2, 10, 10, 10}, RA_CTYPE_UNORM, true }, + {"abgr8", VK_FORMAT_A8B8G8R8_UNORM_PACK32, 4, 4, {8, 8, 8, 8 }, RA_CTYPE_UNORM, true }, + {0} +}; diff --git a/video/out/vulkan/formats.h b/video/out/vulkan/formats.h new file mode 100644 index 0000000..22782a6 --- /dev/null +++ b/video/out/vulkan/formats.h @@ -0,0 +1,16 @@ +#pragma once + +#include "video/out/gpu/ra.h" +#include "common.h" + +struct vk_format { + const char *name; + VkFormat iformat; // vulkan format enum + int components; // how many components are there + int bytes; // how many bytes is a texel + int bits[4]; // how many bits per component + enum ra_ctype ctype; // format representation type + bool fucked_order; // used for formats which are not simply rgba +}; + +extern const struct vk_format vk_formats[]; diff --git a/video/out/vulkan/malloc.c b/video/out/vulkan/malloc.c new file mode 100644 index 0000000..f6cb114 --- /dev/null +++ b/video/out/vulkan/malloc.c @@ -0,0 +1,423 @@ +#include "malloc.h" +#include "utils.h" +#include "osdep/timer.h" + +// Controls the multiplication factor for new slab allocations. The new slab +// will always be allocated such that the size of the slab is this factor times +// the previous slab. Higher values make it grow faster. +#define MPVK_HEAP_SLAB_GROWTH_RATE 4 + +// Controls the minimum slab size, to reduce the frequency at which very small +// slabs would need to get allocated when allocating the first few buffers. +// (Default: 1 MB) +#define MPVK_HEAP_MINIMUM_SLAB_SIZE (1 << 20) + +// Controls the maximum slab size, to reduce the effect of unbounded slab +// growth exhausting memory. If the application needs a single allocation +// that's bigger than this value, it will be allocated directly from the +// device. 
(Default: 512 MB) +#define MPVK_HEAP_MAXIMUM_SLAB_SIZE (1 << 29) + +// Controls the minimum free region size, to reduce thrashing the free space +// map with lots of small buffers during uninit. (Default: 1 KB) +#define MPVK_HEAP_MINIMUM_REGION_SIZE (1 << 10) + +// Represents a region of available memory +struct vk_region { + size_t start; // first offset in region + size_t end; // first offset *not* in region +}; + +static inline size_t region_len(struct vk_region r) +{ + return r.end - r.start; +} + +// A single slab represents a contiguous region of allocated memory. Actual +// allocations are served as slices of this. Slabs are organized into linked +// lists, which represent individual heaps. +struct vk_slab { + VkDeviceMemory mem; // underlying device allocation + size_t size; // total size of `slab` + size_t used; // number of bytes actually in use (for GC accounting) + bool dedicated; // slab is allocated specifically for one object + // free space map: a sorted list of memory regions that are available + struct vk_region *regions; + int num_regions; + // optional, depends on the memory type: + VkBuffer buffer; // buffer spanning the entire slab + void *data; // mapped memory corresponding to `mem` +}; + +// Represents a single memory heap. We keep track of a vk_heap for each +// combination of buffer type and memory selection parameters. This shouldn't +// actually be that many in practice, because some combinations simply never +// occur, and others will generally be the same for the same objects. +struct vk_heap { + VkBufferUsageFlags usage; // the buffer usage type (or 0) + VkMemoryPropertyFlags flags; // the memory type flags (or 0) + uint32_t typeBits; // the memory type index requirements (or 0) + struct vk_slab **slabs; // array of slabs sorted by size + int num_slabs; +}; + +// The overall state of the allocator, which keeps track of a vk_heap for each +// memory type. 
+struct vk_malloc { + VkPhysicalDeviceMemoryProperties props; + struct vk_heap *heaps; + int num_heaps; +}; + +static void slab_free(struct mpvk_ctx *vk, struct vk_slab *slab) +{ + if (!slab) + return; + + assert(slab->used == 0); + + int64_t start = mp_time_us(); + vkDestroyBuffer(vk->dev, slab->buffer, MPVK_ALLOCATOR); + // also implicitly unmaps the memory if needed + vkFreeMemory(vk->dev, slab->mem, MPVK_ALLOCATOR); + int64_t stop = mp_time_us(); + + MP_VERBOSE(vk, "Freeing slab of size %zu took %lld μs.\n", + slab->size, (long long)(stop - start)); + + talloc_free(slab); +} + +static bool find_best_memtype(struct mpvk_ctx *vk, uint32_t typeBits, + VkMemoryPropertyFlags flags, + VkMemoryType *out_type, int *out_index) +{ + struct vk_malloc *ma = vk->alloc; + + // The vulkan spec requires memory types to be sorted in the "optimal" + // order, so the first matching type we find will be the best/fastest one. + for (int i = 0; i < ma->props.memoryTypeCount; i++) { + // The memory type flags must include our properties + if ((ma->props.memoryTypes[i].propertyFlags & flags) != flags) + continue; + // The memory type must be supported by the requirements (bitfield) + if (typeBits && !(typeBits & (1 << i))) + continue; + *out_type = ma->props.memoryTypes[i]; + *out_index = i; + return true; + } + + MP_ERR(vk, "Found no memory type matching property flags 0x%x and type " + "bits 0x%x!\n", (unsigned)flags, (unsigned)typeBits); + return false; +} + +static struct vk_slab *slab_alloc(struct mpvk_ctx *vk, struct vk_heap *heap, + size_t size) +{ + struct vk_slab *slab = talloc_ptrtype(NULL, slab); + *slab = (struct vk_slab) { + .size = size, + }; + + MP_TARRAY_APPEND(slab, slab->regions, slab->num_regions, (struct vk_region) { + .start = 0, + .end = slab->size, + }); + + VkMemoryAllocateInfo minfo = { + .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, + .allocationSize = slab->size, + }; + + uint32_t typeBits = heap->typeBits ? 
heap->typeBits : UINT32_MAX; + if (heap->usage) { + VkBufferCreateInfo binfo = { + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .size = slab->size, + .usage = heap->usage, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + }; + + VK(vkCreateBuffer(vk->dev, &binfo, MPVK_ALLOCATOR, &slab->buffer)); + + VkMemoryRequirements reqs; + vkGetBufferMemoryRequirements(vk->dev, slab->buffer, &reqs); + minfo.allocationSize = reqs.size; // this can be larger than slab->size + typeBits &= reqs.memoryTypeBits; // this can restrict the types + } + + VkMemoryType type; + int index; + if (!find_best_memtype(vk, typeBits, heap->flags, &type, &index)) + goto error; + + MP_VERBOSE(vk, "Allocating %zu memory of type 0x%x (id %d) in heap %d.\n", + slab->size, (unsigned)type.propertyFlags, index, (int)type.heapIndex); + + minfo.memoryTypeIndex = index; + VK(vkAllocateMemory(vk->dev, &minfo, MPVK_ALLOCATOR, &slab->mem)); + + if (heap->flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) + VK(vkMapMemory(vk->dev, slab->mem, 0, VK_WHOLE_SIZE, 0, &slab->data)); + + if (slab->buffer) + VK(vkBindBufferMemory(vk->dev, slab->buffer, slab->mem, 0)); + + return slab; + +error: + slab_free(vk, slab); + return NULL; +} + +static void insert_region(struct vk_slab *slab, struct vk_region region) +{ + if (region.start == region.end) + return; + + bool big_enough = region_len(region) >= MPVK_HEAP_MINIMUM_REGION_SIZE; + + // Find the index of the first region that comes after this + for (int i = 0; i < slab->num_regions; i++) { + struct vk_region *r = &slab->regions[i]; + + // Check for a few special cases which can be coalesced + if (r->end == region.start) { + // The new region is at the tail of this region. 
In addition to + // modifying this region, we also need to coalesce all the following + // regions for as long as possible + r->end = region.end; + + struct vk_region *next = &slab->regions[i+1]; + while (i+1 < slab->num_regions && r->end == next->start) { + r->end = next->end; + MP_TARRAY_REMOVE_AT(slab->regions, slab->num_regions, i+1); + } + return; + } + + if (r->start == region.end) { + // The new region is at the head of this region. We don't need to + // do anything special here - because if this could be further + // coalesced backwards, the previous loop iteration would already + // have caught it. + r->start = region.start; + return; + } + + if (r->start > region.start) { + // The new region comes somewhere before this region, so insert + // it into this index in the array. + if (big_enough) { + MP_TARRAY_INSERT_AT(slab, slab->regions, slab->num_regions, + i, region); + } + return; + } + } + + // If we've reached the end of this loop, then all of the regions + // come before the new region, and are disconnected - so append it + if (big_enough) + MP_TARRAY_APPEND(slab, slab->regions, slab->num_regions, region); +} + +static void heap_uninit(struct mpvk_ctx *vk, struct vk_heap *heap) +{ + for (int i = 0; i < heap->num_slabs; i++) + slab_free(vk, heap->slabs[i]); + + talloc_free(heap->slabs); + *heap = (struct vk_heap){0}; +} + +void vk_malloc_init(struct mpvk_ctx *vk) +{ + assert(vk->physd); + vk->alloc = talloc_zero(NULL, struct vk_malloc); + vkGetPhysicalDeviceMemoryProperties(vk->physd, &vk->alloc->props); +} + +void vk_malloc_uninit(struct mpvk_ctx *vk) +{ + struct vk_malloc *ma = vk->alloc; + if (!ma) + return; + + for (int i = 0; i < ma->num_heaps; i++) + heap_uninit(vk, &ma->heaps[i]); + + talloc_free(ma); + vk->alloc = NULL; +} + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice) +{ + struct vk_slab *slab = slice.priv; + if (!slab) + return; + + assert(slab->used >= slice.size); + slab->used -= slice.size; + + MP_DBG(vk, "Freeing 
slice %zu + %zu from slab with size %zu\n", + slice.offset, slice.size, slab->size); + + if (slab->dedicated) { + // If the slab was purpose-allocated for this memslice, we can just + // free it here + slab_free(vk, slab); + } else { + // Return the allocation to the free space map + insert_region(slab, (struct vk_region) { + .start = slice.offset, + .end = slice.offset + slice.size, + }); + } +} + +// reqs: can be NULL +static struct vk_heap *find_heap(struct mpvk_ctx *vk, VkBufferUsageFlags usage, + VkMemoryPropertyFlags flags, + VkMemoryRequirements *reqs) +{ + struct vk_malloc *ma = vk->alloc; + int typeBits = reqs ? reqs->memoryTypeBits : 0; + + for (int i = 0; i < ma->num_heaps; i++) { + if (ma->heaps[i].usage != usage) + continue; + if (ma->heaps[i].flags != flags) + continue; + if (ma->heaps[i].typeBits != typeBits) + continue; + return &ma->heaps[i]; + } + + // Not found => add it + MP_TARRAY_GROW(ma, ma->heaps, ma->num_heaps + 1); + struct vk_heap *heap = &ma->heaps[ma->num_heaps++]; + *heap = (struct vk_heap) { + .usage = usage, + .flags = flags, + .typeBits = typeBits, + }; + return heap; +} + +static inline bool region_fits(struct vk_region r, size_t size, size_t align) +{ + return MP_ALIGN_UP(r.start, align) + size <= r.end; +} + +// Finds the best-fitting region in a heap. If the heap is too small or too +// fragmented, a new slab will be allocated under the hood. 
+static bool heap_get_region(struct mpvk_ctx *vk, struct vk_heap *heap, + size_t size, size_t align, + struct vk_slab **out_slab, int *out_index) +{ + struct vk_slab *slab = NULL; + + // If the allocation is very big, serve it directly instead of bothering + // with the heap + if (size > MPVK_HEAP_MAXIMUM_SLAB_SIZE) { + slab = slab_alloc(vk, heap, size); + *out_slab = slab; + *out_index = 0; + return !!slab; + } + + for (int i = 0; i < heap->num_slabs; i++) { + slab = heap->slabs[i]; + if (slab->size < size) + continue; + + // Attempt a best fit search + int best = -1; + for (int n = 0; n < slab->num_regions; n++) { + struct vk_region r = slab->regions[n]; + if (!region_fits(r, size, align)) + continue; + if (best >= 0 && region_len(r) > region_len(slab->regions[best])) + continue; + best = n; + } + + if (best >= 0) { + *out_slab = slab; + *out_index = best; + return true; + } + } + + // Otherwise, allocate a new vk_slab and append it to the list. + size_t cur_size = MPMAX(size, slab ? slab->size : 0); + size_t slab_size = MPVK_HEAP_SLAB_GROWTH_RATE * cur_size; + slab_size = MPMAX(MPVK_HEAP_MINIMUM_SLAB_SIZE, slab_size); + slab_size = MPMIN(MPVK_HEAP_MAXIMUM_SLAB_SIZE, slab_size); + assert(slab_size >= size); + slab = slab_alloc(vk, heap, slab_size); + if (!slab) + return false; + MP_TARRAY_APPEND(NULL, heap->slabs, heap->num_slabs, slab); + + // Return the only region there is in a newly allocated slab + assert(slab->num_regions == 1); + *out_slab = slab; + *out_index = 0; + return true; +} + +static bool slice_heap(struct mpvk_ctx *vk, struct vk_heap *heap, size_t size, + size_t alignment, struct vk_memslice *out) +{ + struct vk_slab *slab; + int index; + alignment = MP_ALIGN_UP(alignment, vk->limits.bufferImageGranularity); + if (!heap_get_region(vk, heap, size, alignment, &slab, &index)) + return false; + + struct vk_region reg = slab->regions[index]; + MP_TARRAY_REMOVE_AT(slab->regions, slab->num_regions, index); + *out = (struct vk_memslice) { + .vkmem = 
slab->mem, + .offset = MP_ALIGN_UP(reg.start, alignment), + .size = size, + .priv = slab, + }; + + MP_DBG(vk, "Sub-allocating slice %zu + %zu from slab with size %zu\n", + out->offset, out->size, slab->size); + + size_t out_end = out->offset + out->size; + insert_region(slab, (struct vk_region) { reg.start, out->offset }); + insert_region(slab, (struct vk_region) { out_end, reg.end }); + + slab->used += size; + return true; +} + +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlags flags, struct vk_memslice *out) +{ + struct vk_heap *heap = find_heap(vk, 0, flags, &reqs); + return slice_heap(vk, heap, reqs.size, reqs.alignment, out); +} + +bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlags bufFlags, + VkMemoryPropertyFlags memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out) +{ + struct vk_heap *heap = find_heap(vk, bufFlags, memFlags, NULL); + if (!slice_heap(vk, heap, size, alignment, &out->mem)) + return false; + + struct vk_slab *slab = out->mem.priv; + out->buf = slab->buffer; + if (slab->data) + out->data = (void *)((uintptr_t)slab->data + (ptrdiff_t)out->mem.offset); + + return true; +} diff --git a/video/out/vulkan/malloc.h b/video/out/vulkan/malloc.h new file mode 100644 index 0000000..466c8d8 --- /dev/null +++ b/video/out/vulkan/malloc.h @@ -0,0 +1,35 @@ +#pragma once + +#include "common.h" + +void vk_malloc_init(struct mpvk_ctx *vk); +void vk_malloc_uninit(struct mpvk_ctx *vk); + +// Represents a single "slice" of generic (non-buffer) memory, plus some +// metadata for accounting. This struct is essentially read-only. 
+struct vk_memslice { + VkDeviceMemory vkmem; + size_t offset; + size_t size; + void *priv; +}; + +void vk_free_memslice(struct mpvk_ctx *vk, struct vk_memslice slice); +bool vk_malloc_generic(struct mpvk_ctx *vk, VkMemoryRequirements reqs, + VkMemoryPropertyFlags flags, struct vk_memslice *out); + +// Represents a single "slice" of a larger buffer +struct vk_bufslice { + struct vk_memslice mem; // must be freed by the user when done + VkBuffer buf; // the buffer this memory was sliced from + // For persistently mapped buffers, this points to the first usable byte of + // this slice. + void *data; +}; + +// Allocate a buffer slice. This is more efficient than vk_malloc_generic for +// when the user needs lots of buffers, since it doesn't require +// creating/destroying lots of (little) VkBuffers. +bool vk_malloc_buffer(struct mpvk_ctx *vk, VkBufferUsageFlags bufFlags, + VkMemoryPropertyFlags memFlags, VkDeviceSize size, + VkDeviceSize alignment, struct vk_bufslice *out); diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c new file mode 100644 index 0000000..f85e30e --- /dev/null +++ b/video/out/vulkan/ra_vk.c @@ -0,0 +1,1747 @@ +#include "video/out/gpu/utils.h" +#include "video/out/gpu/spirv.h" + +#include "ra_vk.h" +#include "malloc.h" + +static struct ra_fns ra_fns_vk; + +// For ra.priv +struct ra_vk { + struct mpvk_ctx *vk; + struct ra_tex *clear_tex; // stupid hack for clear() + struct vk_cmd *cmd; // currently recording cmd +}; + +struct mpvk_ctx *ra_vk_get(struct ra *ra) +{ + if (ra->fns != &ra_fns_vk) + return NULL; + + struct ra_vk *p = ra->priv; + return p->vk; +} + +// Returns a command buffer, or NULL on error +static struct vk_cmd *vk_require_cmd(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = ra_vk_get(ra); + + if (!p->cmd) + p->cmd = vk_cmd_begin(vk, vk->pool); + + return p->cmd; +} + +// Note: This technically follows the flush() API, but we don't need +// to expose that (and in fact, it's a bad idea) since we 
control flushing +// behavior with ra_vk_present_frame already. +static bool vk_flush(struct ra *ra, VkSemaphore *done) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = ra_vk_get(ra); + + if (p->cmd) { + if (!vk_cmd_submit(vk, p->cmd, done)) + return false; + p->cmd = NULL; + } + + return true; +} + +// The callback's *priv will always be set to `ra` +static void vk_callback(struct ra *ra, vk_cb callback, void *arg) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = ra_vk_get(ra); + + if (p->cmd) { + vk_cmd_callback(p->cmd, callback, ra, arg); + } else { + vk_dev_callback(vk, callback, ra, arg); + } +} + +#define MAKE_LAZY_DESTRUCTOR(fun, argtype) \ + static void fun##_lazy(struct ra *ra, argtype *arg) { \ + vk_callback(ra, (vk_cb) fun, arg); \ + } + +static void vk_destroy_ra(struct ra *ra) +{ + struct ra_vk *p = ra->priv; + struct mpvk_ctx *vk = ra_vk_get(ra); + + vk_flush(ra, NULL); + mpvk_dev_wait_idle(vk); + ra_tex_free(ra, &p->clear_tex); + + talloc_free(ra); +} + +static bool vk_setup_formats(struct ra *ra) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + for (const struct vk_format *vk_fmt = vk_formats; vk_fmt->name; vk_fmt++) { + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, vk_fmt->iformat, &prop); + + // As a bare minimum, we need to sample from an allocated image + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + if (!(flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT)) + continue; + + VkFormatFeatureFlags linear_bits, render_bits; + linear_bits = VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + render_bits = VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BIT | + VK_FORMAT_FEATURE_COLOR_ATTACHMENT_BLEND_BIT; + + struct ra_format *fmt = talloc_zero(ra, struct ra_format); + *fmt = (struct ra_format) { + .name = vk_fmt->name, + .priv = (void *)vk_fmt, + .ctype = vk_fmt->ctype, + .ordered = !vk_fmt->fucked_order, + .num_components = vk_fmt->components, + .pixel_size = vk_fmt->bytes, + .linear_filter = !!(flags & 
linear_bits), + .renderable = !!(flags & render_bits), + }; + + for (int i = 0; i < 4; i++) + fmt->component_size[i] = fmt->component_depth[i] = vk_fmt->bits[i]; + + fmt->glsl_format = ra_fmt_glsl_format(fmt); + + MP_TARRAY_APPEND(ra, ra->formats, ra->num_formats, fmt); + } + + // Populate some other capabilities related to formats while we're at it + VkImageType imgType[3] = { + VK_IMAGE_TYPE_1D, + VK_IMAGE_TYPE_2D, + VK_IMAGE_TYPE_3D + }; + + // R8_UNORM is supported on literally every single vulkan implementation + const VkFormat testfmt = VK_FORMAT_R8_UNORM; + + for (int d = 0; d < 3; d++) { + VkImageFormatProperties iprop; + VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd, + testfmt, imgType[d], VK_IMAGE_TILING_OPTIMAL, + VK_IMAGE_USAGE_SAMPLED_BIT, 0, &iprop); + + switch (imgType[d]) { + case VK_IMAGE_TYPE_1D: + if (res == VK_SUCCESS) + ra->caps |= RA_CAP_TEX_1D; + break; + case VK_IMAGE_TYPE_2D: + // 2D formats must be supported by RA, so ensure this is the case + VK_ASSERT(res, "Querying 2D format limits"); + ra->max_texture_wh = MPMIN(iprop.maxExtent.width, iprop.maxExtent.height); + break; + case VK_IMAGE_TYPE_3D: + if (res == VK_SUCCESS) + ra->caps |= RA_CAP_TEX_3D; + break; + } + } + + // RA_CAP_BLIT implies both blitting between images as well as blitting + // directly to the swapchain image, so check for all three operations + bool blittable = true; + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, testfmt, &prop); + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_SRC_BIT)) + blittable = false; + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT)) + blittable = false; + + vkGetPhysicalDeviceFormatProperties(vk->physd, vk->surf_format.format, &prop); + if (!(prop.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT)) + blittable = false; + + if (blittable) + ra->caps |= RA_CAP_BLIT; + + return true; + +error: + return false; +} + +static struct ra_fns ra_fns_vk; + +struct ra 
*ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log) +{ + assert(vk->dev); + assert(vk->alloc); + + struct ra *ra = talloc_zero(NULL, struct ra); + ra->log = log; + ra->fns = &ra_fns_vk; + + struct ra_vk *p = ra->priv = talloc_zero(ra, struct ra_vk); + p->vk = vk; + + ra->caps |= vk->spirv->ra_caps; + ra->glsl_version = vk->spirv->glsl_version; + ra->glsl_vulkan = true; + ra->max_shmem = vk->limits.maxComputeSharedMemorySize; + ra->max_pushc_size = vk->limits.maxPushConstantsSize; + + if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT) + ra->caps |= RA_CAP_COMPUTE; + + if (!vk_setup_formats(ra)) + goto error; + + // UBO support is required + ra->caps |= RA_CAP_BUF_RO | RA_CAP_FRAGCOORD; + + // textureGather is only supported in GLSL 400+ + if (ra->glsl_version >= 400) + ra->caps |= RA_CAP_GATHER; + + // Try creating a shader storage buffer + struct ra_buf_params ssbo_params = { + .type = RA_BUF_TYPE_SHADER_STORAGE, + .size = 16, + }; + + struct ra_buf *ssbo = ra_buf_create(ra, &ssbo_params); + if (ssbo) { + ra->caps |= RA_CAP_BUF_RW; + ra_buf_free(ra, &ssbo); + } + + // To support clear() by region, we need to allocate a dummy 1x1 image that + // will be used as the source of blit operations + struct ra_tex_params clear_params = { + .dimensions = 1, // no point in using a 2D image if height = 1 + .w = 1, + .h = 1, + .d = 1, + .format = ra_find_float16_format(ra, 4), + .blit_src = 1, + .host_mutable = 1, + }; + + p->clear_tex = ra_tex_create(ra, &clear_params); + if (!p->clear_tex) { + MP_ERR(ra, "Failed creating 1x1 dummy texture for clear()!\n"); + goto error; + } + + return ra; + +error: + vk_destroy_ra(ra); + return NULL; +} + +// Boilerplate wrapper around vkCreateRenderPass to ensure passes remain +// compatible +static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt, + bool load_fbo, VkRenderPass *out) +{ + struct vk_format *vk_fmt = fmt->priv; + assert(fmt->renderable); + + VkRenderPassCreateInfo rinfo = { + .sType = 
VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkAttachmentDescription) { + .format = vk_fmt->iformat, + .samples = VK_SAMPLE_COUNT_1_BIT, + .loadOp = load_fbo ? VK_ATTACHMENT_LOAD_OP_LOAD + : VK_ATTACHMENT_LOAD_OP_DONT_CARE, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = load_fbo ? VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL + : VK_IMAGE_LAYOUT_UNDEFINED, + .finalLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + }, + .subpassCount = 1, + .pSubpasses = &(VkSubpassDescription) { + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .colorAttachmentCount = 1, + .pColorAttachments = &(VkAttachmentReference) { + .attachment = 0, + .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, + }, + }, + }; + + return vkCreateRenderPass(dev, &rinfo, MPVK_ALLOCATOR, out); +} + +// For ra_tex.priv +struct ra_tex_vk { + bool external_img; + VkImageType type; + VkImage img; + struct vk_memslice mem; + // for sampling + VkImageView view; + VkSampler sampler; + // for rendering + VkFramebuffer framebuffer; + VkRenderPass dummyPass; + // for uploading + struct ra_buf_pool pbo; + // "current" metadata, can change during the course of execution + VkImageLayout current_layout; + VkPipelineStageFlags current_stage; + VkAccessFlags current_access; +}; + +// Small helper to ease image barrier creation. 
if `discard` is set, the contents +// of the image will be undefined after the barrier +static void tex_barrier(struct vk_cmd *cmd, struct ra_tex_vk *tex_vk, + VkPipelineStageFlags newStage, VkAccessFlags newAccess, + VkImageLayout newLayout, bool discard) +{ + VkImageMemoryBarrier imgBarrier = { + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .oldLayout = tex_vk->current_layout, + .newLayout = newLayout, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .srcAccessMask = tex_vk->current_access, + .dstAccessMask = newAccess, + .image = tex_vk->img, + .subresourceRange = vk_range, + }; + + if (discard) { + imgBarrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED; + imgBarrier.srcAccessMask = 0; + } + + if (imgBarrier.oldLayout != imgBarrier.newLayout || + imgBarrier.srcAccessMask != imgBarrier.dstAccessMask) + { + vkCmdPipelineBarrier(cmd->buf, tex_vk->current_stage, newStage, 0, + 0, NULL, 0, NULL, 1, &imgBarrier); + } + + tex_vk->current_stage = newStage; + tex_vk->current_layout = newLayout; + tex_vk->current_access = newAccess; +} + +static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex) +{ + if (!tex) + return; + + struct mpvk_ctx *vk = ra_vk_get(ra); + struct ra_tex_vk *tex_vk = tex->priv; + + ra_buf_pool_uninit(ra, &tex_vk->pbo); + vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, tex_vk->dummyPass, MPVK_ALLOCATOR); + vkDestroySampler(vk->dev, tex_vk->sampler, MPVK_ALLOCATOR); + vkDestroyImageView(vk->dev, tex_vk->view, MPVK_ALLOCATOR); + if (!tex_vk->external_img) { + vkDestroyImage(vk->dev, tex_vk->img, MPVK_ALLOCATOR); + vk_free_memslice(vk, tex_vk->mem); + } + + talloc_free(tex); +} + +MAKE_LAZY_DESTRUCTOR(vk_tex_destroy, struct ra_tex); + +// Initializes non-VkImage values like the image view, samplers, etc. 
+static bool vk_init_image(struct ra *ra, struct ra_tex *tex) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct ra_tex_params *params = &tex->params; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex_vk->img); + + tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED; + tex_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + tex_vk->current_access = 0; + + if (params->render_src || params->render_dst) { + static const VkImageViewType viewType[] = { + [VK_IMAGE_TYPE_1D] = VK_IMAGE_VIEW_TYPE_1D, + [VK_IMAGE_TYPE_2D] = VK_IMAGE_VIEW_TYPE_2D, + [VK_IMAGE_TYPE_3D] = VK_IMAGE_VIEW_TYPE_3D, + }; + + const struct vk_format *fmt = params->format->priv; + VkImageViewCreateInfo vinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .image = tex_vk->img, + .viewType = viewType[tex_vk->type], + .format = fmt->iformat, + .subresourceRange = vk_range, + }; + + VK(vkCreateImageView(vk->dev, &vinfo, MPVK_ALLOCATOR, &tex_vk->view)); + } + + if (params->render_src) { + assert(params->format->linear_filter || !params->src_linear); + VkFilter filter = params->src_linear + ? VK_FILTER_LINEAR + : VK_FILTER_NEAREST; + VkSamplerAddressMode wrap = params->src_repeat + ? 
VK_SAMPLER_ADDRESS_MODE_REPEAT + : VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_EDGE; + VkSamplerCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .magFilter = filter, + .minFilter = filter, + .addressModeU = wrap, + .addressModeV = wrap, + .addressModeW = wrap, + .maxAnisotropy = 1.0, + }; + + VK(vkCreateSampler(vk->dev, &sinfo, MPVK_ALLOCATOR, &tex_vk->sampler)); + } + + if (params->render_dst) { + // Framebuffers need to be created against a specific render pass + // layout, so we need to temporarily create a skeleton/dummy render + // pass for vulkan to figure out the compatibility + VK(vk_create_render_pass(vk->dev, params->format, false, &tex_vk->dummyPass)); + + VkFramebufferCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .renderPass = tex_vk->dummyPass, + .attachmentCount = 1, + .pAttachments = &tex_vk->view, + .width = tex->params.w, + .height = tex->params.h, + .layers = 1, + }; + + VK(vkCreateFramebuffer(vk->dev, &finfo, MPVK_ALLOCATOR, + &tex_vk->framebuffer)); + + // NOTE: Normally we would free the dummyPass again here, but a bug + // in the nvidia vulkan driver causes a segfault if you do. 
+ } + + return true; + +error: + return false; +} + +static struct ra_tex *vk_tex_create(struct ra *ra, + const struct ra_tex_params *params) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct ra_tex *tex = talloc_zero(NULL, struct ra_tex); + tex->params = *params; + tex->params.initial_data = NULL; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + + const struct vk_format *fmt = params->format->priv; + switch (params->dimensions) { + case 1: tex_vk->type = VK_IMAGE_TYPE_1D; break; + case 2: tex_vk->type = VK_IMAGE_TYPE_2D; break; + case 3: tex_vk->type = VK_IMAGE_TYPE_3D; break; + default: abort(); + } + + VkImageUsageFlags usage = 0; + if (params->render_src) + usage |= VK_IMAGE_USAGE_SAMPLED_BIT; + if (params->render_dst) + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + if (params->storage_dst) + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + if (params->blit_src) + usage |= VK_IMAGE_USAGE_TRANSFER_SRC_BIT; + if (params->host_mutable || params->blit_dst || params->initial_data) + usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT; + + // Double-check image usage support and fail immediately if invalid + VkImageFormatProperties iprop; + VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd, + fmt->iformat, tex_vk->type, VK_IMAGE_TILING_OPTIMAL, usage, 0, + &iprop); + if (res == VK_ERROR_FORMAT_NOT_SUPPORTED) { + return NULL; + } else { + VK_ASSERT(res, "Querying image format properties"); + } + + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + VkFormatFeatureFlags flags = prop.optimalTilingFeatures; + + bool has_blit_src = flags & VK_FORMAT_FEATURE_BLIT_SRC_BIT, + has_src_linear = flags & VK_FORMAT_FEATURE_SAMPLED_IMAGE_FILTER_LINEAR_BIT; + + if (params->w > iprop.maxExtent.width || + params->h > iprop.maxExtent.height || + params->d > iprop.maxExtent.depth || + (params->blit_src && !has_blit_src) || + (params->src_linear && !has_src_linear)) + { + return NULL; + } + + VkImageCreateInfo 
iinfo = { + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .imageType = tex_vk->type, + .format = fmt->iformat, + .extent = (VkExtent3D) { params->w, params->h, params->d }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 1, + .pQueueFamilyIndices = &vk->pool->qf, + }; + + VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img)); + + VkMemoryPropertyFlags memFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + VkMemoryRequirements reqs; + vkGetImageMemoryRequirements(vk->dev, tex_vk->img, &reqs); + + struct vk_memslice *mem = &tex_vk->mem; + if (!vk_malloc_generic(vk, reqs, memFlags, mem)) + goto error; + + VK(vkBindImageMemory(vk->dev, tex_vk->img, mem->vkmem, mem->offset)); + + if (!vk_init_image(ra, tex)) + goto error; + + if (params->initial_data) { + struct ra_tex_upload_params ul_params = { + .tex = tex, + .invalidate = true, + .src = params->initial_data, + .stride = params->w * fmt->bytes, + }; + if (!ra->fns->tex_upload(ra, &ul_params)) + goto error; + } + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +struct ra_tex *ra_vk_wrap_swapchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + struct ra_tex *tex = NULL; + + const struct ra_format *format = NULL; + for (int i = 0; i < ra->num_formats; i++) { + const struct vk_format *fmt = ra->formats[i]->priv; + if (fmt->iformat == vk->surf_format.format) { + format = ra->formats[i]; + break; + } + } + + if (!format) { + MP_ERR(ra, "Could not find ra_format suitable for wrapped swchain image " + "with surface format 0x%x\n", vk->surf_format.format); + goto error; + } + + tex = talloc_zero(NULL, struct ra_tex); + tex->params = (struct ra_tex_params) { + .format = format, + .dimensions = 2, + .w = info.imageExtent.width, + .h = 
info.imageExtent.height, + .d = 1, + .blit_src = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_SRC_BIT), + .blit_dst = !!(info.imageUsage & VK_IMAGE_USAGE_TRANSFER_DST_BIT), + .render_src = !!(info.imageUsage & VK_IMAGE_USAGE_SAMPLED_BIT), + .render_dst = !!(info.imageUsage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT), + .storage_dst = !!(info.imageUsage & VK_IMAGE_USAGE_STORAGE_BIT), + }; + + struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk); + tex_vk->type = VK_IMAGE_TYPE_2D; + tex_vk->external_img = true; + tex_vk->img = vkimg; + + if (!vk_init_image(ra, tex)) + goto error; + + return tex; + +error: + vk_tex_destroy(ra, tex); + return NULL; +} + +// For ra_buf.priv +struct ra_buf_vk { + struct vk_bufslice slice; + int refcount; // 1 = object allocated but not in use, > 1 = in use + bool needsflush; + // "current" metadata, can change during course of execution + VkPipelineStageFlags current_stage; + VkAccessFlags current_access; +}; + +static void vk_buf_deref(struct ra *ra, struct ra_buf *buf) +{ + if (!buf) + return; + + struct mpvk_ctx *vk = ra_vk_get(ra); + struct ra_buf_vk *buf_vk = buf->priv; + + if (--buf_vk->refcount == 0) { + vk_free_memslice(vk, buf_vk->slice.mem); + talloc_free(buf); + } +} + +static void buf_barrier(struct ra *ra, struct vk_cmd *cmd, struct ra_buf *buf, + VkPipelineStageFlags newStage, + VkAccessFlags newAccess, int offset, size_t size) +{ + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferMemoryBarrier buffBarrier = { + .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, + .srcAccessMask = buf_vk->current_access, + .dstAccessMask = newAccess, + .buffer = buf_vk->slice.buf, + .offset = offset, + .size = size, + }; + + if (buf_vk->needsflush || buf->params.host_mapped) { + buffBarrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + buf_vk->current_stage = VK_PIPELINE_STAGE_HOST_BIT; + buf_vk->needsflush = false; + } + + if (buffBarrier.srcAccessMask != buffBarrier.dstAccessMask) { + vkCmdPipelineBarrier(cmd->buf, 
buf_vk->current_stage, newStage, 0, + 0, NULL, 1, &buffBarrier, 0, NULL); + } + + buf_vk->current_stage = newStage; + buf_vk->current_access = newAccess; + buf_vk->refcount++; + vk_cmd_callback(cmd, (vk_cb) vk_buf_deref, ra, buf); +} + +#define vk_buf_destroy vk_buf_deref +MAKE_LAZY_DESTRUCTOR(vk_buf_destroy, struct ra_buf); + +static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset, + const void *data, size_t size) +{ + assert(buf->params.host_mutable || buf->params.initial_data); + struct ra_buf_vk *buf_vk = buf->priv; + + // For host-mapped buffers, we can just directly memcpy the buffer contents. + // Otherwise, we can update the buffer from the GPU using a command buffer. + if (buf_vk->slice.data) { + assert(offset + size <= buf->params.size); + uintptr_t addr = (uintptr_t)buf_vk->slice.data + offset; + memcpy((void *)addr, data, size); + buf_vk->needsflush = true; + } else { + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) { + MP_ERR(ra, "Failed updating buffer!\n"); + return; + } + + buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, offset, size); + + VkDeviceSize bufOffset = buf_vk->slice.mem.offset + offset; + assert(bufOffset == MP_ALIGN_UP(bufOffset, 4)); + vkCmdUpdateBuffer(cmd->buf, buf_vk->slice.buf, bufOffset, size, data); + } +} + +static struct ra_buf *vk_buf_create(struct ra *ra, + const struct ra_buf_params *params) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct ra_buf *buf = talloc_zero(NULL, struct ra_buf); + buf->params = *params; + + struct ra_buf_vk *buf_vk = buf->priv = talloc_zero(buf, struct ra_buf_vk); + buf_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + buf_vk->current_access = 0; + buf_vk->refcount = 1; + + VkBufferUsageFlags bufFlags = 0; + VkMemoryPropertyFlags memFlags = 0; + VkDeviceSize align = 4; // alignment 4 is needed for buf_update + + switch (params->type) { + case RA_BUF_TYPE_TEX_UPLOAD: + bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT; + 
memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT; + break; + case RA_BUF_TYPE_UNIFORM: + bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + align = MP_ALIGN_UP(align, vk->limits.minUniformBufferOffsetAlignment); + break; + case RA_BUF_TYPE_SHADER_STORAGE: + bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment); + break; + case RA_BUF_TYPE_VERTEX: + bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT; + memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT; + break; + default: abort(); + } + + if (params->host_mutable || params->initial_data) { + bufFlags |= VK_BUFFER_USAGE_TRANSFER_DST_BIT; + align = MP_ALIGN_UP(align, vk->limits.optimalBufferCopyOffsetAlignment); + } + + if (params->host_mapped) { + memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT; + } + + if (!vk_malloc_buffer(vk, bufFlags, memFlags, params->size, align, + &buf_vk->slice)) + { + goto error; + } + + if (params->host_mapped) + buf->data = buf_vk->slice.data; + + if (params->initial_data) + vk_buf_update(ra, buf, 0, params->initial_data, params->size); + + buf->params.initial_data = NULL; // do this after vk_buf_update + return buf; + +error: + vk_buf_destroy(ra, buf); + return NULL; +} + +static bool vk_buf_poll(struct ra *ra, struct ra_buf *buf) +{ + struct ra_buf_vk *buf_vk = buf->priv; + return buf_vk->refcount == 1; +} + +static bool vk_tex_upload(struct ra *ra, + const struct ra_tex_upload_params *params) +{ + struct ra_tex *tex = params->tex; + struct ra_tex_vk *tex_vk = tex->priv; + + if (!params->buf) + return ra_tex_upload_pbo(ra, &tex_vk->pbo, params); + + assert(!params->src); + assert(params->buf); + struct ra_buf *buf = params->buf; + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferImageCopy region = { + .bufferOffset = buf_vk->slice.mem.offset + 
params->buf_offset, + .bufferRowLength = tex->params.w, + .bufferImageHeight = tex->params.h, + .imageSubresource = vk_layers, + .imageExtent = (VkExtent3D){tex->params.w, tex->params.h, tex->params.d}, + }; + + if (tex->params.dimensions == 2) { + int pix_size = tex->params.format->pixel_size; + region.bufferRowLength = params->stride / pix_size; + if (region.bufferRowLength * pix_size != params->stride) { + MP_ERR(ra, "Texture upload strides must be a multiple of the texel " + "size!\n"); + goto error; + } + + if (params->rc) { + struct mp_rect *rc = params->rc; + region.imageOffset = (VkOffset3D){rc->x0, rc->y0, 0}; + region.imageExtent = (VkExtent3D){mp_rect_w(*rc), mp_rect_h(*rc), 1}; + } + } + + uint64_t size = region.bufferRowLength * region.bufferImageHeight * + region.imageExtent.depth; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_READ_BIT, region.bufferOffset, size); + + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + params->invalidate); + + vkCmdCopyBufferToImage(cmd->buf, buf_vk->slice.buf, tex_vk->img, + tex_vk->current_layout, 1, ®ion); + + return true; + +error: + return false; +} + +#define MPVK_NUM_DS MPVK_MAX_STREAMING_DEPTH + +// For ra_renderpass.priv +struct ra_renderpass_vk { + // Pipeline / render pass + VkPipeline pipe; + VkPipelineLayout pipeLayout; + VkRenderPass renderPass; + // Descriptor set (bindings) + VkDescriptorSetLayout dsLayout; + VkDescriptorPool dsPool; + VkDescriptorSet dss[MPVK_NUM_DS]; + int dindex; + // Vertex buffers (vertices) + struct ra_buf_pool vbo; + + // For updating + VkWriteDescriptorSet *dswrite; + VkDescriptorImageInfo *dsiinfo; + VkDescriptorBufferInfo *dsbinfo; +}; + +static void vk_renderpass_destroy(struct ra *ra, struct ra_renderpass *pass) +{ + if (!pass) + return; + + struct mpvk_ctx *vk = ra_vk_get(ra); + struct 
ra_renderpass_vk *pass_vk = pass->priv; + + ra_buf_pool_uninit(ra, &pass_vk->vbo); + vkDestroyPipeline(vk->dev, pass_vk->pipe, MPVK_ALLOCATOR); + vkDestroyRenderPass(vk->dev, pass_vk->renderPass, MPVK_ALLOCATOR); + vkDestroyPipelineLayout(vk->dev, pass_vk->pipeLayout, MPVK_ALLOCATOR); + vkDestroyDescriptorPool(vk->dev, pass_vk->dsPool, MPVK_ALLOCATOR); + vkDestroyDescriptorSetLayout(vk->dev, pass_vk->dsLayout, MPVK_ALLOCATOR); + + talloc_free(pass); +} + +MAKE_LAZY_DESTRUCTOR(vk_renderpass_destroy, struct ra_renderpass); + +static const VkDescriptorType dsType[] = { + [RA_VARTYPE_TEX] = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + [RA_VARTYPE_IMG_W] = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + [RA_VARTYPE_BUF_RO] = VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + [RA_VARTYPE_BUF_RW] = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, +}; + +static bool vk_get_input_format(struct ra *ra, struct ra_renderpass_input *inp, + VkFormat *out_fmt) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + enum ra_ctype ctype; + switch (inp->type) { + case RA_VARTYPE_FLOAT: ctype = RA_CTYPE_FLOAT; break; + case RA_VARTYPE_BYTE_UNORM: ctype = RA_CTYPE_UNORM; break; + default: abort(); + } + + assert(inp->dim_m == 1); + for (const struct vk_format *fmt = vk_formats; fmt->name; fmt++) { + if (fmt->ctype != ctype) + continue; + if (fmt->components != inp->dim_v) + continue; + if (fmt->bytes != ra_renderpass_input_layout(inp).size) + continue; + + // Ensure this format is valid for vertex attributes + VkFormatProperties prop; + vkGetPhysicalDeviceFormatProperties(vk->physd, fmt->iformat, &prop); + if (!(prop.bufferFeatures & VK_FORMAT_FEATURE_VERTEX_BUFFER_BIT)) + continue; + + *out_fmt = fmt->iformat; + return true; + } + + return false; +} + +static const char vk_cache_magic[4] = {'R','A','V','K'}; +static const int vk_cache_version = 2; + +struct vk_cache_header { + char magic[sizeof(vk_cache_magic)]; + int cache_version; + char compiler[SPIRV_NAME_MAX_LEN]; + int compiler_version; + size_t vert_spirv_len; + size_t 
frag_spirv_len; + size_t comp_spirv_len; + size_t pipecache_len; +}; + +static bool vk_use_cached_program(const struct ra_renderpass_params *params, + const struct spirv_compiler *spirv, + struct bstr *vert_spirv, + struct bstr *frag_spirv, + struct bstr *comp_spirv, + struct bstr *pipecache) +{ + struct bstr cache = params->cached_program; + if (cache.len < sizeof(struct vk_cache_header)) + return false; + + struct vk_cache_header *header = (struct vk_cache_header *)cache.start; + cache = bstr_cut(cache, sizeof(*header)); + + if (strncmp(header->magic, vk_cache_magic, sizeof(vk_cache_magic)) != 0) + return false; + if (header->cache_version != vk_cache_version) + return false; + if (strncmp(header->compiler, spirv->name, sizeof(header->compiler)) != 0) + return false; + if (header->compiler_version != spirv->compiler_version) + return false; + +#define GET(ptr) \ + if (cache.len < header->ptr##_len) \ + return false; \ + *ptr = bstr_splice(cache, 0, header->ptr##_len); \ + cache = bstr_cut(cache, ptr->len); + + GET(vert_spirv); + GET(frag_spirv); + GET(comp_spirv); + GET(pipecache); + return true; +} + +static VkResult vk_compile_glsl(struct ra *ra, void *tactx, + enum glsl_shader type, const char *glsl, + struct bstr *spirv) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + VkResult ret = VK_SUCCESS; + int msgl = MSGL_DEBUG; + + if (!vk->spirv->fns->compile_glsl(vk->spirv, tactx, type, glsl, spirv)) { + ret = VK_ERROR_INVALID_SHADER_NV; + msgl = MSGL_ERR; + } + + static const char *shader_names[] = { + [GLSL_SHADER_VERTEX] = "vertex", + [GLSL_SHADER_FRAGMENT] = "fragment", + [GLSL_SHADER_COMPUTE] = "compute", + }; + + if (mp_msg_test(ra->log, msgl)) { + MP_MSG(ra, msgl, "%s shader source:\n", shader_names[type]); + mp_log_source(ra->log, msgl, glsl); + } + return ret; +} + +static const VkShaderStageFlags stageFlags[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_SHADER_STAGE_FRAGMENT_BIT, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_SHADER_STAGE_COMPUTE_BIT, +}; + +static struct 
ra_renderpass *vk_renderpass_create(struct ra *ra, + const struct ra_renderpass_params *params) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + bool success = false; + assert(vk->spirv); + + struct ra_renderpass *pass = talloc_zero(NULL, struct ra_renderpass); + pass->params = *ra_renderpass_params_copy(pass, params); + pass->params.cached_program = (bstr){0}; + struct ra_renderpass_vk *pass_vk = pass->priv = + talloc_zero(pass, struct ra_renderpass_vk); + + // temporary allocations/objects + void *tmp = talloc_new(NULL); + VkPipelineCache pipeCache = NULL; + VkShaderModule vert_shader = NULL; + VkShaderModule frag_shader = NULL; + VkShaderModule comp_shader = NULL; + + static int dsCount[RA_VARTYPE_COUNT] = {0}; + VkDescriptorSetLayoutBinding *bindings = NULL; + int num_bindings = 0; + + for (int i = 0; i < params->num_inputs; i++) { + struct ra_renderpass_input *inp = ¶ms->inputs[i]; + switch (inp->type) { + case RA_VARTYPE_TEX: + case RA_VARTYPE_IMG_W: + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + VkDescriptorSetLayoutBinding desc = { + .binding = inp->binding, + .descriptorType = dsType[inp->type], + .descriptorCount = 1, + .stageFlags = stageFlags[params->type], + }; + + MP_TARRAY_APPEND(tmp, bindings, num_bindings, desc); + dsCount[inp->type]++; + break; + } + default: abort(); + } + } + + VkDescriptorPoolSize *dsPoolSizes = NULL; + int poolSizeCount = 0; + + for (enum ra_vartype t = 0; t < RA_VARTYPE_COUNT; t++) { + if (dsCount[t] > 0) { + VkDescriptorPoolSize dssize = { + .type = dsType[t], + .descriptorCount = dsCount[t] * MPVK_NUM_DS, + }; + + MP_TARRAY_APPEND(tmp, dsPoolSizes, poolSizeCount, dssize); + } + } + + VkDescriptorPoolCreateInfo pinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO, + .maxSets = MPVK_NUM_DS, + .pPoolSizes = dsPoolSizes, + .poolSizeCount = poolSizeCount, + }; + + VK(vkCreateDescriptorPool(vk->dev, &pinfo, MPVK_ALLOCATOR, &pass_vk->dsPool)); + + pass_vk->dswrite = talloc_array(pass, VkWriteDescriptorSet, 
num_bindings); + pass_vk->dsiinfo = talloc_array(pass, VkDescriptorImageInfo, num_bindings); + pass_vk->dsbinfo = talloc_array(pass, VkDescriptorBufferInfo, num_bindings); + + VkDescriptorSetLayoutCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, + .pBindings = bindings, + .bindingCount = num_bindings, + }; + + VK(vkCreateDescriptorSetLayout(vk->dev, &dinfo, MPVK_ALLOCATOR, + &pass_vk->dsLayout)); + + VkDescriptorSetLayout layouts[MPVK_NUM_DS]; + for (int i = 0; i < MPVK_NUM_DS; i++) + layouts[i] = pass_vk->dsLayout; + + VkDescriptorSetAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO, + .descriptorPool = pass_vk->dsPool, + .descriptorSetCount = MPVK_NUM_DS, + .pSetLayouts = layouts, + }; + + VK(vkAllocateDescriptorSets(vk->dev, &ainfo, pass_vk->dss)); + + VkPipelineLayoutCreateInfo linfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO, + .setLayoutCount = 1, + .pSetLayouts = &pass_vk->dsLayout, + .pushConstantRangeCount = params->push_constants_size ? 
1 : 0, + .pPushConstantRanges = &(VkPushConstantRange){ + .stageFlags = stageFlags[params->type], + .offset = 0, + .size = params->push_constants_size, + }, + }; + + VK(vkCreatePipelineLayout(vk->dev, &linfo, MPVK_ALLOCATOR, + &pass_vk->pipeLayout)); + + struct bstr vert = {0}, frag = {0}, comp = {0}, pipecache = {0}; + if (vk_use_cached_program(params, vk->spirv, &vert, &frag, &comp, &pipecache)) { + MP_VERBOSE(ra, "Using cached SPIR-V and VkPipeline.\n"); + } else { + pipecache.len = 0; + switch (params->type) { + case RA_RENDERPASS_TYPE_RASTER: + VK(vk_compile_glsl(ra, tmp, GLSL_SHADER_VERTEX, + params->vertex_shader, &vert)); + VK(vk_compile_glsl(ra, tmp, GLSL_SHADER_FRAGMENT, + params->frag_shader, &frag)); + comp.len = 0; + break; + case RA_RENDERPASS_TYPE_COMPUTE: + VK(vk_compile_glsl(ra, tmp, GLSL_SHADER_COMPUTE, + params->compute_shader, &comp)); + frag.len = 0; + vert.len = 0; + break; + } + } + + VkPipelineCacheCreateInfo pcinfo = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_CACHE_CREATE_INFO, + .pInitialData = pipecache.start, + .initialDataSize = pipecache.len, + }; + + VK(vkCreatePipelineCache(vk->dev, &pcinfo, MPVK_ALLOCATOR, &pipeCache)); + + VkShaderModuleCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, + }; + + switch (params->type) { + case RA_RENDERPASS_TYPE_RASTER: { + sinfo.pCode = (uint32_t *)vert.start; + sinfo.codeSize = vert.len; + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &vert_shader)); + + sinfo.pCode = (uint32_t *)frag.start; + sinfo.codeSize = frag.len; + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &frag_shader)); + + VkVertexInputAttributeDescription *attrs = talloc_array(tmp, + VkVertexInputAttributeDescription, params->num_vertex_attribs); + + for (int i = 0; i < params->num_vertex_attribs; i++) { + struct ra_renderpass_input *inp = ¶ms->vertex_attribs[i]; + attrs[i] = (VkVertexInputAttributeDescription) { + .location = i, + .binding = 0, + .offset = inp->offset, + }; + + if 
(!vk_get_input_format(ra, inp, &attrs[i].format)) { + MP_ERR(ra, "No suitable VkFormat for vertex attrib '%s'!\n", + inp->name); + goto error; + } + } + VK(vk_create_render_pass(vk->dev, params->target_format, + params->enable_blend, &pass_vk->renderPass)); + + static const VkBlendFactor blendFactors[] = { + [RA_BLEND_ZERO] = VK_BLEND_FACTOR_ZERO, + [RA_BLEND_ONE] = VK_BLEND_FACTOR_ONE, + [RA_BLEND_SRC_ALPHA] = VK_BLEND_FACTOR_SRC_ALPHA, + [RA_BLEND_ONE_MINUS_SRC_ALPHA] = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA, + }; + + VkGraphicsPipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, + .stageCount = 2, + .pStages = (VkPipelineShaderStageCreateInfo[]) { + { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_VERTEX_BIT, + .module = vert_shader, + .pName = "main", + }, { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_FRAGMENT_BIT, + .module = frag_shader, + .pName = "main", + } + }, + .pVertexInputState = &(VkPipelineVertexInputStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO, + .vertexBindingDescriptionCount = 1, + .pVertexBindingDescriptions = &(VkVertexInputBindingDescription) { + .binding = 0, + .stride = params->vertex_stride, + .inputRate = VK_VERTEX_INPUT_RATE_VERTEX, + }, + .vertexAttributeDescriptionCount = params->num_vertex_attribs, + .pVertexAttributeDescriptions = attrs, + }, + .pInputAssemblyState = &(VkPipelineInputAssemblyStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, + .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST, + }, + .pViewportState = &(VkPipelineViewportStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, + .viewportCount = 1, + .scissorCount = 1, + }, + .pRasterizationState = &(VkPipelineRasterizationStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO, + .polygonMode = 
VK_POLYGON_MODE_FILL, + .cullMode = VK_CULL_MODE_NONE, + .lineWidth = 1.0f, + }, + .pMultisampleState = &(VkPipelineMultisampleStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, + .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + }, + .pColorBlendState = &(VkPipelineColorBlendStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO, + .attachmentCount = 1, + .pAttachments = &(VkPipelineColorBlendAttachmentState) { + .blendEnable = params->enable_blend, + .colorBlendOp = VK_BLEND_OP_ADD, + .srcColorBlendFactor = blendFactors[params->blend_src_rgb], + .dstColorBlendFactor = blendFactors[params->blend_dst_rgb], + .alphaBlendOp = VK_BLEND_OP_ADD, + .srcAlphaBlendFactor = blendFactors[params->blend_src_alpha], + .dstAlphaBlendFactor = blendFactors[params->blend_dst_alpha], + .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | + VK_COLOR_COMPONENT_B_BIT | + VK_COLOR_COMPONENT_A_BIT, + }, + }, + .pDynamicState = &(VkPipelineDynamicStateCreateInfo) { + .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO, + .dynamicStateCount = 2, + .pDynamicStates = (VkDynamicState[]){ + VK_DYNAMIC_STATE_VIEWPORT, + VK_DYNAMIC_STATE_SCISSOR, + }, + }, + .layout = pass_vk->pipeLayout, + .renderPass = pass_vk->renderPass, + }; + + VK(vkCreateGraphicsPipelines(vk->dev, pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, &pass_vk->pipe)); + break; + } + case RA_RENDERPASS_TYPE_COMPUTE: { + sinfo.pCode = (uint32_t *)comp.start; + sinfo.codeSize = comp.len; + VK(vkCreateShaderModule(vk->dev, &sinfo, MPVK_ALLOCATOR, &comp_shader)); + + VkComputePipelineCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, + .stage = { + .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, + .stage = VK_SHADER_STAGE_COMPUTE_BIT, + .module = comp_shader, + .pName = "main", + }, + .layout = pass_vk->pipeLayout, + }; + + VK(vkCreateComputePipelines(vk->dev, pipeCache, 1, &cinfo, + MPVK_ALLOCATOR, 
&pass_vk->pipe)); + break; + } + } + + // Update params->cached_program + struct bstr cache = {0}; + VK(vkGetPipelineCacheData(vk->dev, pipeCache, &cache.len, NULL)); + cache.start = talloc_size(tmp, cache.len); + VK(vkGetPipelineCacheData(vk->dev, pipeCache, &cache.len, cache.start)); + + struct vk_cache_header header = { + .cache_version = vk_cache_version, + .compiler_version = vk->spirv->compiler_version, + .vert_spirv_len = vert.len, + .frag_spirv_len = frag.len, + .comp_spirv_len = comp.len, + .pipecache_len = cache.len, + }; + + for (int i = 0; i < MP_ARRAY_SIZE(header.magic); i++) + header.magic[i] = vk_cache_magic[i]; + for (int i = 0; i < sizeof(vk->spirv->name); i++) + header.compiler[i] = vk->spirv->name[i]; + + struct bstr *prog = &pass->params.cached_program; + bstr_xappend(pass, prog, (struct bstr){ (char *) &header, sizeof(header) }); + bstr_xappend(pass, prog, vert); + bstr_xappend(pass, prog, frag); + bstr_xappend(pass, prog, comp); + bstr_xappend(pass, prog, cache); + + success = true; + +error: + if (!success) { + vk_renderpass_destroy(ra, pass); + pass = NULL; + } + + vkDestroyShaderModule(vk->dev, vert_shader, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, frag_shader, MPVK_ALLOCATOR); + vkDestroyShaderModule(vk->dev, comp_shader, MPVK_ALLOCATOR); + vkDestroyPipelineCache(vk->dev, pipeCache, MPVK_ALLOCATOR); + talloc_free(tmp); + return pass; +} + +static void vk_update_descriptor(struct ra *ra, struct vk_cmd *cmd, + struct ra_renderpass *pass, + struct ra_renderpass_input_val val, + VkDescriptorSet ds, int idx) +{ + struct ra_renderpass_vk *pass_vk = pass->priv; + struct ra_renderpass_input *inp = &pass->params.inputs[val.index]; + + VkWriteDescriptorSet *wds = &pass_vk->dswrite[idx]; + *wds = (VkWriteDescriptorSet) { + .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET, + .dstSet = ds, + .dstBinding = inp->binding, + .descriptorCount = 1, + .descriptorType = dsType[inp->type], + }; + + static const VkPipelineStageFlags passStages[] = { + 
[RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + }; + + switch (inp->type) { + case RA_VARTYPE_TEX: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.render_src); + tex_barrier(cmd, tex_vk, passStages[pass->params.type], + VK_ACCESS_SHADER_READ_BIT, + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .sampler = tex_vk->sampler, + .imageView = tex_vk->view, + .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_IMG_W: { + struct ra_tex *tex = *(struct ra_tex **)val.data; + struct ra_tex_vk *tex_vk = tex->priv; + + assert(tex->params.storage_dst); + tex_barrier(cmd, tex_vk, passStages[pass->params.type], + VK_ACCESS_SHADER_WRITE_BIT, + VK_IMAGE_LAYOUT_GENERAL, false); + + VkDescriptorImageInfo *iinfo = &pass_vk->dsiinfo[idx]; + *iinfo = (VkDescriptorImageInfo) { + .imageView = tex_vk->view, + .imageLayout = tex_vk->current_layout, + }; + + wds->pImageInfo = iinfo; + break; + } + case RA_VARTYPE_BUF_RO: + case RA_VARTYPE_BUF_RW: { + struct ra_buf *buf = *(struct ra_buf **)val.data; + struct ra_buf_vk *buf_vk = buf->priv; + + VkBufferUsageFlags access = VK_ACCESS_SHADER_READ_BIT; + if (inp->type == RA_VARTYPE_BUF_RW) + access |= VK_ACCESS_SHADER_WRITE_BIT; + + buf_barrier(ra, cmd, buf, passStages[pass->params.type], + access, buf_vk->slice.mem.offset, buf->params.size); + + VkDescriptorBufferInfo *binfo = &pass_vk->dsbinfo[idx]; + *binfo = (VkDescriptorBufferInfo) { + .buffer = buf_vk->slice.buf, + .offset = buf_vk->slice.mem.offset, + .range = buf->params.size, + }; + + wds->pBufferInfo = binfo; + break; + } + } +} + +static void vk_renderpass_run(struct ra *ra, + const struct ra_renderpass_run_params *params) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + struct 
ra_renderpass *pass = params->pass; + struct ra_renderpass_vk *pass_vk = pass->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + static const VkPipelineBindPoint bindPoint[] = { + [RA_RENDERPASS_TYPE_RASTER] = VK_PIPELINE_BIND_POINT_GRAPHICS, + [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_BIND_POINT_COMPUTE, + }; + + vkCmdBindPipeline(cmd->buf, bindPoint[pass->params.type], pass_vk->pipe); + + VkDescriptorSet ds = pass_vk->dss[pass_vk->dindex++]; + pass_vk->dindex %= MPVK_NUM_DS; + + for (int i = 0; i < params->num_values; i++) + vk_update_descriptor(ra, cmd, pass, params->values[i], ds, i); + + if (params->num_values > 0) { + vkUpdateDescriptorSets(vk->dev, params->num_values, pass_vk->dswrite, + 0, NULL); + } + + vkCmdBindDescriptorSets(cmd->buf, bindPoint[pass->params.type], + pass_vk->pipeLayout, 0, 1, &ds, 0, NULL); + + if (pass->params.push_constants_size) { + vkCmdPushConstants(cmd->buf, pass_vk->pipeLayout, + stageFlags[pass->params.type], 0, + pass->params.push_constants_size, + params->push_constants); + } + + switch (pass->params.type) { + case RA_RENDERPASS_TYPE_COMPUTE: + vkCmdDispatch(cmd->buf, params->compute_groups[0], + params->compute_groups[1], + params->compute_groups[2]); + break; + case RA_RENDERPASS_TYPE_RASTER: { + struct ra_tex *tex = params->target; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.render_dst); + + struct ra_buf_params buf_params = { + .type = RA_BUF_TYPE_VERTEX, + .size = params->vertex_count * pass->params.vertex_stride, + .host_mutable = true, + }; + + struct ra_buf *buf = ra_buf_pool_get(ra, &pass_vk->vbo, &buf_params); + if (!buf) { + MP_ERR(ra, "Failed allocating vertex buffer!\n"); + goto error; + } + struct ra_buf_vk *buf_vk = buf->priv; + + vk_buf_update(ra, buf, 0, params->vertex_data, buf_params.size); + + buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, + buf_vk->slice.mem.offset, buf_params.size); + + 
vkCmdBindVertexBuffers(cmd->buf, 0, 1, &buf_vk->slice.buf, + &buf_vk->slice.mem.offset); + + if (pass->params.enable_blend) { + // Normally this transition is handled implicitly by the renderpass, + // but if we need to preserve the FBO we have to do it manually. + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, false); + } + + VkViewport viewport = { + .x = params->viewport.x0, + .y = params->viewport.y0, + .width = mp_rect_w(params->viewport), + .height = mp_rect_h(params->viewport), + }; + + VkRect2D scissor = { + .offset = {params->scissors.x0, params->scissors.y0}, + .extent = {mp_rect_w(params->scissors), mp_rect_h(params->scissors)}, + }; + + vkCmdSetViewport(cmd->buf, 0, 1, &viewport); + vkCmdSetScissor(cmd->buf, 0, 1, &scissor); + + VkRenderPassBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .renderPass = pass_vk->renderPass, + .framebuffer = tex_vk->framebuffer, + .renderArea = (VkRect2D){{0, 0}, {tex->params.w, tex->params.h}}, + }; + + vkCmdBeginRenderPass(cmd->buf, &binfo, VK_SUBPASS_CONTENTS_INLINE); + vkCmdDraw(cmd->buf, params->vertex_count, 1, 0, 0); + vkCmdEndRenderPass(cmd->buf); + + // The renderPass implicitly transitions the texture to this layout + tex_vk->current_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + tex_vk->current_access = VK_ACCESS_SHADER_READ_BIT; + tex_vk->current_stage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + break; + } + default: abort(); + }; + +error: + return; +} + +static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src, + struct mp_rect *dst_rc, struct mp_rect *src_rc) +{ + assert(src->params.blit_src); + assert(dst->params.blit_dst); + + struct ra_tex_vk *src_vk = src->priv; + struct ra_tex_vk *dst_vk = dst->priv; + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + tex_barrier(cmd, src_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + 
VK_ACCESS_TRANSFER_READ_BIT, + VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + false); + + bool discard = dst_rc->x0 == 0 && + dst_rc->y0 == 0 && + dst_rc->x1 == dst->params.w && + dst_rc->y1 == dst->params.h; + + tex_barrier(cmd, dst_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + discard); + + VkImageBlit region = { + .srcSubresource = vk_layers, + .srcOffsets = {{src_rc->x0, src_rc->y0, 0}, {src_rc->x1, src_rc->y1, 1}}, + .dstSubresource = vk_layers, + .dstOffsets = {{dst_rc->x0, dst_rc->y0, 0}, {dst_rc->x1, dst_rc->y1, 1}}, + }; + + vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout, dst_vk->img, + dst_vk->current_layout, 1, ®ion, VK_FILTER_NEAREST); +} + +static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4], + struct mp_rect *rc) +{ + struct ra_vk *p = ra->priv; + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex->params.blit_dst); + + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + struct mp_rect full = {0, 0, tex->params.w, tex->params.h}; + if (!rc || mp_rect_equals(rc, &full)) { + // To clear the entire image, we can use the efficient clear command + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, true); + + VkClearColorValue clearColor = {0}; + for (int c = 0; c < 4; c++) + clearColor.float32[c] = color[c]; + + vkCmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->current_layout, + &clearColor, 1, &vk_range); + } else { + // To simulate per-region clearing, we blit from a 1x1 texture instead + struct ra_tex_upload_params ul_params = { + .tex = p->clear_tex, + .invalidate = true, + .src = &color[0], + }; + vk_tex_upload(ra, &ul_params); + vk_blit(ra, tex, p->clear_tex, rc, &(struct mp_rect){0, 0, 1, 1}); + } +} + +static int vk_desc_namespace(enum ra_vartype type) +{ + return 0; +} + +#define VK_QUERY_POOL_SIZE (MPVK_MAX_STREAMING_DEPTH * 4) + +struct vk_timer { + VkQueryPool 
pool; + int index; + uint64_t result; +}; + +static void vk_timer_destroy(struct ra *ra, ra_timer *ratimer) +{ + if (!ratimer) + return; + + struct mpvk_ctx *vk = ra_vk_get(ra); + struct vk_timer *timer = ratimer; + + vkDestroyQueryPool(vk->dev, timer->pool, MPVK_ALLOCATOR); + + talloc_free(timer); +} + +MAKE_LAZY_DESTRUCTOR(vk_timer_destroy, ra_timer); + +static ra_timer *vk_timer_create(struct ra *ra) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + + struct vk_timer *timer = talloc_zero(NULL, struct vk_timer); + + struct VkQueryPoolCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO, + .queryType = VK_QUERY_TYPE_TIMESTAMP, + .queryCount = VK_QUERY_POOL_SIZE, + }; + + VK(vkCreateQueryPool(vk->dev, &qinfo, MPVK_ALLOCATOR, &timer->pool)); + + return (ra_timer *)timer; + +error: + vk_timer_destroy(ra, timer); + return NULL; +} + +static void vk_timer_record(struct ra *ra, VkQueryPool pool, int index, + VkPipelineStageFlags stage) +{ + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + return; + + vkCmdWriteTimestamp(cmd->buf, stage, pool, index); +} + +static void vk_timer_start(struct ra *ra, ra_timer *ratimer) +{ + struct mpvk_ctx *vk = ra_vk_get(ra); + struct vk_timer *timer = ratimer; + + timer->index = (timer->index + 2) % VK_QUERY_POOL_SIZE; + + uint64_t out[2]; + VkResult res = vkGetQueryPoolResults(vk->dev, timer->pool, timer->index, 2, + sizeof(out), &out[0], sizeof(uint64_t), + VK_QUERY_RESULT_64_BIT); + switch (res) { + case VK_SUCCESS: + timer->result = (out[1] - out[0]) * vk->limits.timestampPeriod; + break; + case VK_NOT_READY: + timer->result = 0; + break; + default: + MP_WARN(vk, "Failed reading timer query result: %s\n", vk_err(res)); + return; + }; + + vk_timer_record(ra, timer->pool, timer->index, + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT); +} + +static uint64_t vk_timer_stop(struct ra *ra, ra_timer *ratimer) +{ + struct vk_timer *timer = ratimer; + vk_timer_record(ra, timer->pool, timer->index + 1, + 
VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT); + + return timer->result; +} + +static struct ra_fns ra_fns_vk = { + .destroy = vk_destroy_ra, + .tex_create = vk_tex_create, + .tex_destroy = vk_tex_destroy_lazy, + .tex_upload = vk_tex_upload, + .buf_create = vk_buf_create, + .buf_destroy = vk_buf_destroy_lazy, + .buf_update = vk_buf_update, + .buf_poll = vk_buf_poll, + .clear = vk_clear, + .blit = vk_blit, + .uniform_layout = std140_layout, + .push_constant_layout = std430_layout, + .desc_namespace = vk_desc_namespace, + .renderpass_create = vk_renderpass_create, + .renderpass_destroy = vk_renderpass_destroy_lazy, + .renderpass_run = vk_renderpass_run, + .timer_create = vk_timer_create, + .timer_destroy = vk_timer_destroy_lazy, + .timer_start = vk_timer_start, + .timer_stop = vk_timer_stop, +}; + +static void present_cb(void *priv, int *inflight) +{ + *inflight -= 1; +} + +bool ra_vk_submit(struct ra *ra, struct ra_tex *tex, VkSemaphore acquired, + VkSemaphore *done, int *inflight) +{ + struct vk_cmd *cmd = vk_require_cmd(ra); + if (!cmd) + goto error; + + if (inflight) { + *inflight += 1; + vk_cmd_callback(cmd, (vk_cb)present_cb, NULL, inflight); + } + + struct ra_tex_vk *tex_vk = tex->priv; + assert(tex_vk->external_img); + tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0, + VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, false); + + // These are the only two stages that we use/support for actually + // outputting to swapchain imagechain images, so just add a dependency + // on both of them. In theory, we could maybe come up with some more + // advanced mechanism of tracking dynamic dependencies, but that seems + // like overkill. 
+ vk_cmd_dep(cmd, acquired, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT | + VK_PIPELINE_STAGE_TRANSFER_BIT); + + return vk_flush(ra, done); + +error: + return false; +} diff --git a/video/out/vulkan/ra_vk.h b/video/out/vulkan/ra_vk.h new file mode 100644 index 0000000..893421b --- /dev/null +++ b/video/out/vulkan/ra_vk.h @@ -0,0 +1,31 @@ +#pragma once + +#include "video/out/gpu/ra.h" + +#include "common.h" +#include "utils.h" + +struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log); + +// Access to the VkDevice is needed for swapchain creation +VkDevice ra_vk_get_dev(struct ra *ra); + +// Allocates a ra_tex that wraps a swapchain image. The contents of the image +// will be invalidated, and access to it will only be internally synchronized. +// So the calling could should not do anything else with the VkImage. +struct ra_tex *ra_vk_wrap_swapchain_img(struct ra *ra, VkImage vkimg, + VkSwapchainCreateInfoKHR info); + +// This function flushes the command buffers, transitions `tex` (which must be +// a wrapped swapchain image) into a format suitable for presentation, and +// submits the current rendering commands. The indicated semaphore must fire +// before the submitted command can run. If `done` is non-NULL, it will be +// set to a semaphore that fires once the command completes. If `inflight` +// is non-NULL, it will be incremented when the command starts and decremented +// when it completes. +bool ra_vk_submit(struct ra *ra, struct ra_tex *tex, VkSemaphore acquired, + VkSemaphore *done, int *inflight); + +// May be called on a struct ra of any type. Returns NULL if the ra is not +// a vulkan ra. 
+struct mpvk_ctx *ra_vk_get(struct ra *ra); diff --git a/video/out/vulkan/spirv_nvidia.c b/video/out/vulkan/spirv_nvidia.c new file mode 100644 index 0000000..6cc43a5 --- /dev/null +++ b/video/out/vulkan/spirv_nvidia.c @@ -0,0 +1,54 @@ +#include "video/out/gpu/spirv.h" + +#include "common.h" +#include "context.h" +#include "utils.h" + +static bool nv_glsl_compile(struct spirv_compiler *spirv, void *tactx, + enum glsl_shader type, const char *glsl, + struct bstr *out_spirv) +{ + // The nvidia extension literally assumes your SPIRV is in fact valid GLSL + *out_spirv = bstr0(glsl); + return true; +} + +static bool nv_glsl_init(struct ra_ctx *ctx) +{ + struct mpvk_ctx *vk = ra_vk_ctx_get(ctx); + if (!vk) + return false; + + struct spirv_compiler *spv = ctx->spirv; + spv->required_ext = VK_NV_GLSL_SHADER_EXTENSION_NAME; + spv->glsl_version = 450; // impossible to query, so hard-code it.. + spv->ra_caps = RA_CAP_NESTED_ARRAY; + + // Make sure the extension is actually available, and fail gracefully + // if it isn't + VkExtensionProperties *props = NULL; + uint32_t extnum = 0; + VK(vkEnumerateDeviceExtensionProperties(vk->physd, NULL, &extnum, NULL)); + props = talloc_array(NULL, VkExtensionProperties, extnum); + VK(vkEnumerateDeviceExtensionProperties(vk->physd, NULL, &extnum, props)); + + bool ret = true; + for (int e = 0; e < extnum; e++) { + if (strncmp(props[e].extensionName, spv->required_ext, + VK_MAX_EXTENSION_NAME_SIZE) == 0) + goto done; + } + +error: + MP_VERBOSE(ctx, "Device doesn't support VK_NV_glsl_shader, skipping..\n"); + ret = false; + +done: + talloc_free(props); + return ret; +} + +const struct spirv_compiler_fns spirv_nvidia_builtin = { + .compile_glsl = nv_glsl_compile, + .init = nv_glsl_init, +}; diff --git a/video/out/vulkan/utils.c b/video/out/vulkan/utils.c new file mode 100644 index 0000000..baf0ebc --- /dev/null +++ b/video/out/vulkan/utils.c @@ -0,0 +1,729 @@ +#include <libavutil/macros.h> + +#include "video/out/gpu/spirv.h" +#include 
"utils.h" +#include "malloc.h" + +const char* vk_err(VkResult res) +{ + switch (res) { + // These are technically success codes, but include them nonetheless + case VK_SUCCESS: return "VK_SUCCESS"; + case VK_NOT_READY: return "VK_NOT_READY"; + case VK_TIMEOUT: return "VK_TIMEOUT"; + case VK_EVENT_SET: return "VK_EVENT_SET"; + case VK_EVENT_RESET: return "VK_EVENT_RESET"; + case VK_INCOMPLETE: return "VK_INCOMPLETE"; + case VK_SUBOPTIMAL_KHR: return "VK_SUBOPTIMAL_KHR"; + + // Actual error codes + case VK_ERROR_OUT_OF_HOST_MEMORY: return "VK_ERROR_OUT_OF_HOST_MEMORY"; + case VK_ERROR_OUT_OF_DEVICE_MEMORY: return "VK_ERROR_OUT_OF_DEVICE_MEMORY"; + case VK_ERROR_INITIALIZATION_FAILED: return "VK_ERROR_INITIALIZATION_FAILED"; + case VK_ERROR_DEVICE_LOST: return "VK_ERROR_DEVICE_LOST"; + case VK_ERROR_MEMORY_MAP_FAILED: return "VK_ERROR_MEMORY_MAP_FAILED"; + case VK_ERROR_LAYER_NOT_PRESENT: return "VK_ERROR_LAYER_NOT_PRESENT"; + case VK_ERROR_EXTENSION_NOT_PRESENT: return "VK_ERROR_EXTENSION_NOT_PRESENT"; + case VK_ERROR_FEATURE_NOT_PRESENT: return "VK_ERROR_FEATURE_NOT_PRESENT"; + case VK_ERROR_INCOMPATIBLE_DRIVER: return "VK_ERROR_INCOMPATIBLE_DRIVER"; + case VK_ERROR_TOO_MANY_OBJECTS: return "VK_ERROR_TOO_MANY_OBJECTS"; + case VK_ERROR_FORMAT_NOT_SUPPORTED: return "VK_ERROR_FORMAT_NOT_SUPPORTED"; + case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + case VK_ERROR_INVALID_SHADER_NV: return "VK_ERROR_INVALID_SHADER_NV"; + case VK_ERROR_OUT_OF_DATE_KHR: return "VK_ERROR_OUT_OF_DATE_KHR"; + case VK_ERROR_SURFACE_LOST_KHR: return "VK_ERROR_SURFACE_LOST_KHR"; + } + + return "Unknown error!"; +} + +static const char* vk_dbg_type(VkDebugReportObjectTypeEXT type) +{ + switch (type) { + case VK_DEBUG_REPORT_OBJECT_TYPE_INSTANCE_EXT: + return "VkInstance"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT: + return "VkPhysicalDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT: + return "VkDevice"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT: + return 
"VkQueue"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SEMAPHORE_EXT: + return "VkSemaphore"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_BUFFER_EXT: + return "VkCommandBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FENCE_EXT: + return "VkFence"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_MEMORY_EXT: + return "VkDeviceMemory"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_EXT: + return "VkBuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT: + return "VkImage"; + case VK_DEBUG_REPORT_OBJECT_TYPE_EVENT_EXT: + return "VkEvent"; + case VK_DEBUG_REPORT_OBJECT_TYPE_QUERY_POOL_EXT: + return "VkQueryPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_BUFFER_VIEW_EXT: + return "VkBufferView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_VIEW_EXT: + return "VkImageView"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SHADER_MODULE_EXT: + return "VkShaderModule"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_CACHE_EXT: + return "VkPipelineCache"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_LAYOUT_EXT: + return "VkPipelineLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_RENDER_PASS_EXT: + return "VkRenderPass"; + case VK_DEBUG_REPORT_OBJECT_TYPE_PIPELINE_EXT: + return "VkPipeline"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_LAYOUT_EXT: + return "VkDescriptorSetLayout"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SAMPLER_EXT: + return "VkSampler"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_POOL_EXT: + return "VkDescriptorPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DESCRIPTOR_SET_EXT: + return "VkDescriptorSet"; + case VK_DEBUG_REPORT_OBJECT_TYPE_FRAMEBUFFER_EXT: + return "VkFramebuffer"; + case VK_DEBUG_REPORT_OBJECT_TYPE_COMMAND_POOL_EXT: + return "VkCommandPool"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SURFACE_KHR_EXT: + return "VkSurfaceKHR"; + case VK_DEBUG_REPORT_OBJECT_TYPE_SWAPCHAIN_KHR_EXT: + return "VkSwapchainKHR"; + case VK_DEBUG_REPORT_OBJECT_TYPE_DEBUG_REPORT_EXT: + return "VkDebugReportCallbackEXT"; + case VK_DEBUG_REPORT_OBJECT_TYPE_UNKNOWN_EXT: + default: + return "unknown object"; + } +} + +static VkBool32 
vk_dbg_callback(VkDebugReportFlagsEXT flags, + VkDebugReportObjectTypeEXT objType, + uint64_t obj, size_t loc, int32_t msgCode, + const char *layer, const char *msg, void *priv) +{ + struct mpvk_ctx *vk = priv; + int lev = MSGL_V; + + switch (flags) { + case VK_DEBUG_REPORT_ERROR_BIT_EXT: lev = MSGL_ERR; break; + case VK_DEBUG_REPORT_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_INFORMATION_BIT_EXT: lev = MSGL_TRACE; break; + case VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT: lev = MSGL_WARN; break; + case VK_DEBUG_REPORT_DEBUG_BIT_EXT: lev = MSGL_DEBUG; break; + }; + + MP_MSG(vk, lev, "vk [%s] %d: %s (obj 0x%llx (%s), loc 0x%zx)\n", + layer, (int)msgCode, msg, (unsigned long long)obj, + vk_dbg_type(objType), loc); + + // The return value of this function determines whether the call will + // be explicitly aborted (to prevent GPU errors) or not. In this case, + // we generally want this to be on for the errors. + return (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT); +} + +static void vk_cmdpool_uninit(struct mpvk_ctx *vk, struct vk_cmdpool *pool) +{ + if (!pool) + return; + + // also frees associated command buffers + vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR); + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + vkDestroyFence(vk->dev, pool->cmds[n].fence, MPVK_ALLOCATOR); + vkDestroySemaphore(vk->dev, pool->cmds[n].done, MPVK_ALLOCATOR); + talloc_free(pool->cmds[n].callbacks); + } + talloc_free(pool); +} + +void mpvk_uninit(struct mpvk_ctx *vk) +{ + if (!vk->inst) + return; + + if (vk->dev) { + vk_cmdpool_uninit(vk, vk->pool); + vk_malloc_uninit(vk); + vkDestroyDevice(vk->dev, MPVK_ALLOCATOR); + } + + if (vk->dbg) { + // Same deal as creating the debug callback, we need to load this + // first. 
+ VK_LOAD_PFN(vkDestroyDebugReportCallbackEXT) + pfn_vkDestroyDebugReportCallbackEXT(vk->inst, vk->dbg, MPVK_ALLOCATOR); + } + + vkDestroySurfaceKHR(vk->inst, vk->surf, MPVK_ALLOCATOR); + vkDestroyInstance(vk->inst, MPVK_ALLOCATOR); + + *vk = (struct mpvk_ctx){0}; +} + +bool mpvk_instance_init(struct mpvk_ctx *vk, struct mp_log *log, + const char *surf_ext_name, bool debug) +{ + *vk = (struct mpvk_ctx) { + .log = log, + }; + + VkInstanceCreateInfo info = { + .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, + }; + + if (debug) { + // Enables the LunarG standard validation layer, which + // is a meta-layer that loads lots of other validators + static const char* layers[] = { + "VK_LAYER_LUNARG_standard_validation", + }; + + info.ppEnabledLayerNames = layers; + info.enabledLayerCount = MP_ARRAY_SIZE(layers); + } + + // Enable whatever extensions were compiled in. + const char *extensions[] = { + VK_KHR_SURFACE_EXTENSION_NAME, + surf_ext_name, + + // Extra extensions only used for debugging. These are toggled by + // decreasing the enabledExtensionCount, so the number needs to be + // synchronized with the code below. 
+ VK_EXT_DEBUG_REPORT_EXTENSION_NAME, + }; + + const int debugExtensionCount = 1; + + info.ppEnabledExtensionNames = extensions; + info.enabledExtensionCount = MP_ARRAY_SIZE(extensions); + + if (!debug) + info.enabledExtensionCount -= debugExtensionCount; + + MP_VERBOSE(vk, "Creating instance with extensions:\n"); + for (int i = 0; i < info.enabledExtensionCount; i++) + MP_VERBOSE(vk, " %s\n", info.ppEnabledExtensionNames[i]); + + VkResult res = vkCreateInstance(&info, MPVK_ALLOCATOR, &vk->inst); + if (res != VK_SUCCESS) { + MP_VERBOSE(vk, "Failed creating instance: %s\n", vk_err(res)); + return false; + } + + if (debug) { + // Set up a debug callback to catch validation messages + VkDebugReportCallbackCreateInfoEXT dinfo = { + .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + .flags = VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_WARNING_BIT_EXT | + VK_DEBUG_REPORT_PERFORMANCE_WARNING_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | + VK_DEBUG_REPORT_DEBUG_BIT_EXT, + .pfnCallback = vk_dbg_callback, + .pUserData = vk, + }; + + // Since this is not part of the core spec, we need to load it. This + // can't fail because we've already successfully created an instance + // with this extension enabled. 
+ VK_LOAD_PFN(vkCreateDebugReportCallbackEXT) + pfn_vkCreateDebugReportCallbackEXT(vk->inst, &dinfo, MPVK_ALLOCATOR, + &vk->dbg); + } + + return true; +} + +#define MPVK_MAX_DEVICES 16 + +static bool physd_supports_surface(struct mpvk_ctx *vk, VkPhysicalDevice physd) +{ + uint32_t qfnum; + vkGetPhysicalDeviceQueueFamilyProperties(physd, &qfnum, NULL); + + for (int i = 0; i < qfnum; i++) { + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(physd, i, vk->surf, &sup)); + if (sup) + return true; + } + +error: + return false; +} + +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw) +{ + assert(vk->surf); + + MP_VERBOSE(vk, "Probing for vulkan devices:\n"); + + VkPhysicalDevice *devices = NULL; + uint32_t num = 0; + VK(vkEnumeratePhysicalDevices(vk->inst, &num, NULL)); + devices = talloc_array(NULL, VkPhysicalDevice, num); + VK(vkEnumeratePhysicalDevices(vk->inst, &num, devices)); + + // Sorted by "priority". Reuses some m_opt code for convenience + static const struct m_opt_choice_alternatives types[] = { + {"discrete", VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU}, + {"integrated", VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU}, + {"virtual", VK_PHYSICAL_DEVICE_TYPE_VIRTUAL_GPU}, + {"software", VK_PHYSICAL_DEVICE_TYPE_CPU}, + {"unknown", VK_PHYSICAL_DEVICE_TYPE_OTHER}, + {0} + }; + + VkPhysicalDeviceProperties props[MPVK_MAX_DEVICES]; + for (int i = 0; i < num; i++) { + vkGetPhysicalDeviceProperties(devices[i], &props[i]); + MP_VERBOSE(vk, " GPU %d: %s (%s)\n", i, props[i].deviceName, + m_opt_choice_str(types, props[i].deviceType)); + } + + // Iterate through each type in order of decreasing preference + for (int t = 0; types[t].name; t++) { + // Disallow SW rendering unless explicitly enabled + if (types[t].value == VK_PHYSICAL_DEVICE_TYPE_CPU && !sw) + continue; + + for (int i = 0; i < num; i++) { + VkPhysicalDeviceProperties prop = props[i]; + if (prop.deviceType != types[t].value) + continue; + if (name && strcmp(name, prop.deviceName) != 0) + 
continue; + if (!physd_supports_surface(vk, devices[i])) + continue; + + MP_VERBOSE(vk, "Chose device:\n"); + MP_VERBOSE(vk, " Device Name: %s\n", prop.deviceName); + MP_VERBOSE(vk, " Device ID: %x:%x\n", + (unsigned)prop.vendorID, (unsigned)prop.deviceID); + MP_VERBOSE(vk, " Driver version: %d\n", (int)prop.driverVersion); + MP_VERBOSE(vk, " API version: %d.%d.%d\n", + (int)VK_VERSION_MAJOR(prop.apiVersion), + (int)VK_VERSION_MINOR(prop.apiVersion), + (int)VK_VERSION_PATCH(prop.apiVersion)); + vk->physd = devices[i]; + vk->limits = prop.limits; + talloc_free(devices); + return true; + } + } + +error: + MP_VERBOSE(vk, "Found no suitable device, giving up.\n"); + talloc_free(devices); + return false; +} + +bool mpvk_pick_surface_format(struct mpvk_ctx *vk) +{ + assert(vk->physd); + + VkSurfaceFormatKHR *formats = NULL; + int num; + + // Enumerate through the surface formats and find one that we can map to + // a ra_format + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, NULL)); + formats = talloc_array(NULL, VkSurfaceFormatKHR, num); + VK(vkGetPhysicalDeviceSurfaceFormatsKHR(vk->physd, vk->surf, &num, formats)); + + for (int i = 0; i < num; i++) { + // A value of VK_FORMAT_UNDEFINED means we can pick anything we want + if (formats[i].format == VK_FORMAT_UNDEFINED) { + vk->surf_format = (VkSurfaceFormatKHR) { + .colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR, + .format = VK_FORMAT_R16G16B16A16_UNORM, + }; + break; + } + + if (formats[i].colorSpace != VK_COLOR_SPACE_SRGB_NONLINEAR_KHR) + continue; + + // Format whitelist, since we want only >= 8 bit _UNORM formats + switch (formats[i].format) { + case VK_FORMAT_R8G8B8_UNORM: + case VK_FORMAT_B8G8R8_UNORM: + case VK_FORMAT_R8G8B8A8_UNORM: + case VK_FORMAT_B8G8R8A8_UNORM: + case VK_FORMAT_A8B8G8R8_UNORM_PACK32: + case VK_FORMAT_A2R10G10B10_UNORM_PACK32: + case VK_FORMAT_A2B10G10R10_UNORM_PACK32: + case VK_FORMAT_R16G16B16_UNORM: + case VK_FORMAT_R16G16B16A16_UNORM: + break; // accept + default: 
continue; + } + + vk->surf_format = formats[i]; + break; + } + + talloc_free(formats); + + if (!vk->surf_format.format) + goto error; + + return true; + +error: + MP_ERR(vk, "Failed picking surface format!\n"); + talloc_free(formats); + return false; +} + +static bool vk_cmdpool_init(struct mpvk_ctx *vk, VkDeviceQueueCreateInfo qinfo, + VkQueueFamilyProperties props, + struct vk_cmdpool **out) +{ + struct vk_cmdpool *pool = *out = talloc_ptrtype(NULL, pool); + *pool = (struct vk_cmdpool) { + .qf = qinfo.queueFamilyIndex, + .props = props, + .qcount = qinfo.queueCount, + }; + + for (int n = 0; n < pool->qcount; n++) + vkGetDeviceQueue(vk->dev, pool->qf, n, &pool->queues[n]); + + VkCommandPoolCreateInfo cinfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | + VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = pool->qf, + }; + + VK(vkCreateCommandPool(vk->dev, &cinfo, MPVK_ALLOCATOR, &pool->pool)); + + VkCommandBufferAllocateInfo ainfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, + .commandPool = pool->pool, + .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY, + .commandBufferCount = MPVK_MAX_CMDS, + }; + + VkCommandBuffer cmdbufs[MPVK_MAX_CMDS]; + VK(vkAllocateCommandBuffers(vk->dev, &ainfo, cmdbufs)); + + for (int n = 0; n < MPVK_MAX_CMDS; n++) { + struct vk_cmd *cmd = &pool->cmds[n]; + cmd->pool = pool; + cmd->buf = cmdbufs[n]; + + VkFenceCreateInfo finfo = { + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }; + + VK(vkCreateFence(vk->dev, &finfo, MPVK_ALLOCATOR, &cmd->fence)); + + VkSemaphoreCreateInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + }; + + VK(vkCreateSemaphore(vk->dev, &sinfo, MPVK_ALLOCATOR, &cmd->done)); + } + + return true; + +error: + return false; +} + +bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts) +{ + assert(vk->physd); + void *tmp = talloc_new(NULL); + + // Enumerate 
the queue families and find suitable families for each task + int qfnum; + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, NULL); + VkQueueFamilyProperties *qfs = talloc_array(tmp, VkQueueFamilyProperties, qfnum); + vkGetPhysicalDeviceQueueFamilyProperties(vk->physd, &qfnum, qfs); + + MP_VERBOSE(vk, "Queue families supported by device:\n"); + + for (int i = 0; i < qfnum; i++) { + MP_VERBOSE(vk, " QF %d: flags 0x%x num %d\n", i, + (unsigned)qfs[i].queueFlags, (int)qfs[i].queueCount); + } + + // For most of our rendering operations, we want to use one "primary" pool, + // so just pick the queue family with the most features. + int idx = -1; + for (int i = 0; i < qfnum; i++) { + if (!(qfs[i].queueFlags & VK_QUEUE_GRAPHICS_BIT)) + continue; + + // QF supports more features + if (idx < 0 || qfs[i].queueFlags > qfs[idx].queueFlags) + idx = i; + + // QF supports more queues (at the same specialization level) + if (qfs[i].queueFlags == qfs[idx].queueFlags && + qfs[i].queueCount > qfs[idx].queueCount) + { + idx = i; + } + } + + // Vulkan requires at least one GRAPHICS queue, so if this fails something + // is horribly wrong. 
+ assert(idx >= 0); + + // Ensure we can actually present to the surface using this queue + VkBool32 sup; + VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx, vk->surf, &sup)); + if (!sup) { + MP_ERR(vk, "Queue family does not support surface presentation!\n"); + goto error; + } + + // Now that we know which queue families we want, we can create the logical + // device + assert(opts.queue_count <= MPVK_MAX_QUEUES); + static const float priorities[MPVK_MAX_QUEUES] = {0}; + VkDeviceQueueCreateInfo qinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO, + .queueFamilyIndex = idx, + .queueCount = MPMIN(qfs[idx].queueCount, opts.queue_count), + .pQueuePriorities = priorities, + }; + + const char **exts = NULL; + int num_exts = 0; + MP_TARRAY_APPEND(tmp, exts, num_exts, VK_KHR_SWAPCHAIN_EXTENSION_NAME); + if (vk->spirv->required_ext) + MP_TARRAY_APPEND(tmp, exts, num_exts, vk->spirv->required_ext); + + VkDeviceCreateInfo dinfo = { + .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, + .queueCreateInfoCount = 1, + .pQueueCreateInfos = &qinfo, + .ppEnabledExtensionNames = exts, + .enabledExtensionCount = num_exts, + }; + + MP_VERBOSE(vk, "Creating vulkan device with extensions:\n"); + for (int i = 0; i < num_exts; i++) + MP_VERBOSE(vk, " %s\n", exts[i]); + + VK(vkCreateDevice(vk->physd, &dinfo, MPVK_ALLOCATOR, &vk->dev)); + + vk_malloc_init(vk); + + // Create the vk_cmdpools and all required queues / synchronization objects + if (!vk_cmdpool_init(vk, qinfo, qfs[idx], &vk->pool)) + goto error; + + talloc_free(tmp); + return true; + +error: + MP_ERR(vk, "Failed creating logical device!\n"); + talloc_free(tmp); + return false; +} + +static void run_callbacks(struct mpvk_ctx *vk, struct vk_cmd *cmd) +{ + for (int i = 0; i < cmd->num_callbacks; i++) { + struct vk_callback *cb = &cmd->callbacks[i]; + cb->run(cb->priv, cb->arg); + *cb = (struct vk_callback){0}; + } + + cmd->num_callbacks = 0; + + // Also reset vk->last_cmd in case this was the last command to run + if 
(vk->last_cmd == cmd) + vk->last_cmd = NULL; +} + +static void wait_for_cmds(struct mpvk_ctx *vk, struct vk_cmd cmds[], int num) +{ + if (!num) + return; + + VkFence fences[MPVK_MAX_CMDS]; + for (int i = 0; i < num; i++) + fences[i] = cmds[i].fence; + + vkWaitForFences(vk->dev, num, fences, true, UINT64_MAX); + + for (int i = 0; i < num; i++) + run_callbacks(vk, &cmds[i]); +} + +void mpvk_pool_wait_idle(struct mpvk_ctx *vk, struct vk_cmdpool *pool) +{ + if (!pool) + return; + + int idx = pool->cindex, pidx = pool->cindex_pending; + if (pidx < idx) { // range doesn't wrap + wait_for_cmds(vk, &pool->cmds[pidx], idx - pidx); + } else if (pidx > idx) { // range wraps + wait_for_cmds(vk, &pool->cmds[pidx], MPVK_MAX_CMDS - pidx); + wait_for_cmds(vk, &pool->cmds[0], idx); + } + pool->cindex_pending = pool->cindex; +} + +void mpvk_dev_wait_idle(struct mpvk_ctx *vk) +{ + mpvk_pool_wait_idle(vk, vk->pool); +} + +void mpvk_pool_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout) +{ + if (!pool) + return; + + // If requested, hard block until at least one command completes + if (timeout > 0 && pool->cindex_pending != pool->cindex) { + vkWaitForFences(vk->dev, 1, &pool->cmds[pool->cindex_pending].fence, + true, timeout); + } + + // Lazily garbage collect the commands based on their status + while (pool->cindex_pending != pool->cindex) { + struct vk_cmd *cmd = &pool->cmds[pool->cindex_pending]; + VkResult res = vkGetFenceStatus(vk->dev, cmd->fence); + if (res != VK_SUCCESS) + break; + run_callbacks(vk, cmd); + pool->cindex_pending++; + pool->cindex_pending %= MPVK_MAX_CMDS; + } +} + +void mpvk_dev_poll_cmds(struct mpvk_ctx *vk, uint32_t timeout) +{ + mpvk_pool_poll_cmds(vk, vk->pool, timeout); +} + +void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg) +{ + if (vk->last_cmd) { + vk_cmd_callback(vk->last_cmd, callback, p, arg); + } else { + // The device was already idle, so we can just immediately call it + callback(p, arg); + } +} 
+ +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg) +{ + MP_TARRAY_GROW(NULL, cmd->callbacks, cmd->num_callbacks); + cmd->callbacks[cmd->num_callbacks++] = (struct vk_callback) { + .run = callback, + .priv = p, + .arg = arg, + }; +} + +void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, + VkPipelineStageFlags depstage) +{ + assert(cmd->num_deps < MPVK_MAX_CMD_DEPS); + cmd->deps[cmd->num_deps] = dep; + cmd->depstages[cmd->num_deps++] = depstage; +} + +struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool) +{ + // Garbage collect the cmdpool first + mpvk_pool_poll_cmds(vk, pool, 0); + + int next = (pool->cindex + 1) % MPVK_MAX_CMDS; + if (next == pool->cindex_pending) { + MP_ERR(vk, "No free command buffers!\n"); + goto error; + } + + struct vk_cmd *cmd = &pool->cmds[pool->cindex]; + pool->cindex = next; + + VK(vkResetCommandBuffer(cmd->buf, 0)); + + VkCommandBufferBeginInfo binfo = { + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + }; + + VK(vkBeginCommandBuffer(cmd->buf, &binfo)); + + return cmd; + +error: + return NULL; +} + +bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done) +{ + VK(vkEndCommandBuffer(cmd->buf)); + + struct vk_cmdpool *pool = cmd->pool; + VkQueue queue = pool->queues[pool->qindex]; + + VkSubmitInfo sinfo = { + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .commandBufferCount = 1, + .pCommandBuffers = &cmd->buf, + .waitSemaphoreCount = cmd->num_deps, + .pWaitSemaphores = cmd->deps, + .pWaitDstStageMask = cmd->depstages, + }; + + if (done) { + sinfo.signalSemaphoreCount = 1; + sinfo.pSignalSemaphores = &cmd->done; + *done = cmd->done; + } + + VK(vkResetFences(vk->dev, 1, &cmd->fence)); + VK(vkQueueSubmit(queue, 1, &sinfo, cmd->fence)); + MP_TRACE(vk, "Submitted command on queue %p (QF %d)\n", (void *)queue, + pool->qf); + + for (int i = 0; i < cmd->num_deps; i++) + cmd->deps[i] = NULL; + cmd->num_deps = 0; + + 
vk->last_cmd = cmd; + return true; + +error: + return false; +} + +void vk_cmd_cycle_queues(struct mpvk_ctx *vk) +{ + struct vk_cmdpool *pool = vk->pool; + pool->qindex = (pool->qindex + 1) % pool->qcount; +} + +const VkImageSubresourceRange vk_range = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .levelCount = 1, + .layerCount = 1, +}; + +const VkImageSubresourceLayers vk_layers = { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .layerCount = 1, +}; diff --git a/video/out/vulkan/utils.h b/video/out/vulkan/utils.h new file mode 100644 index 0000000..0cc8a29 --- /dev/null +++ b/video/out/vulkan/utils.h @@ -0,0 +1,154 @@ +#pragma once + +#include "video/out/vo.h" +#include "video/out/gpu/context.h" +#include "video/mp_image.h" + +#include "common.h" +#include "formats.h" + +#define VK_LOAD_PFN(name) PFN_##name pfn_##name = (PFN_##name) \ + vkGetInstanceProcAddr(vk->inst, #name); + +// Return a human-readable name for various struct mpvk_ctx enums +const char* vk_err(VkResult res); + +// Convenience macros to simplify a lot of common boilerplate +#define VK_ASSERT(res, str) \ + do { \ + if (res != VK_SUCCESS) { \ + MP_ERR(vk, str ": %s\n", vk_err(res)); \ + goto error; \ + } \ + } while (0) + +#define VK(cmd) \ + do { \ + MP_TRACE(vk, #cmd "\n"); \ + VkResult res ## __LINE__ = (cmd); \ + VK_ASSERT(res ## __LINE__, #cmd); \ + } while (0) + +// Uninits everything in the correct order +void mpvk_uninit(struct mpvk_ctx *vk); + +// Initialization functions: As a rule of thumb, these need to be called in +// this order, followed by vk_malloc_init, followed by RA initialization, and +// finally followed by vk_swchain initialization. + +// Create a vulkan instance. Returns VK_NULL_HANDLE on failure +bool mpvk_instance_init(struct mpvk_ctx *vk, struct mp_log *log, + const char *surf_ext_name, bool debug); + +// Generate a VkSurfaceKHR usable for video output. Returns VK_NULL_HANDLE on +// failure. Must be called after mpvk_instance_init. 
+bool mpvk_surface_init(struct vo *vo, struct mpvk_ctx *vk); + +// Find a suitable physical device for use with rendering and which supports +// the surface. +// name: only match a device with this name +// sw: also allow software/virtual devices +bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw); + +// Pick a suitable surface format that's supported by this physical device. +bool mpvk_pick_surface_format(struct mpvk_ctx *vk); + +struct mpvk_device_opts { + int queue_count; // number of queues to use +}; + +// Create a logical device and initialize the vk_cmdpools +bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts); + +// Wait until all commands submitted to all queues have completed +void mpvk_pool_wait_idle(struct mpvk_ctx *vk, struct vk_cmdpool *pool); +void mpvk_dev_wait_idle(struct mpvk_ctx *vk); + +// Wait until at least one command submitted to any queue has completed, and +// process the callbacks. Good for event loops that need to delay until a +// command completes. Will block at most `timeout` nanoseconds. If used with +// 0, it only garbage collects completed commands without blocking. +void mpvk_pool_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool, + uint64_t timeout); +void mpvk_dev_poll_cmds(struct mpvk_ctx *vk, uint32_t timeout); + +// Since lots of vulkan operations need to be done lazily once the affected +// resources are no longer in use, provide an abstraction for tracking these. +// In practice, these are only checked and run when submitting new commands, so +// the actual execution may be delayed by a frame. +typedef void (*vk_cb)(void *priv, void *arg); + +struct vk_callback { + vk_cb run; + void *priv; + void *arg; // as a convenience, you also get to pass an arg for "free" +}; + +// Associate a callback with the completion of all currently pending commands. +// This will essentially run once the device is completely idle. 
+void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg); + +#define MPVK_MAX_CMD_DEPS 8 + +// Helper wrapper around command buffers that also track dependencies, +// callbacks and synchronization primitives +struct vk_cmd { + struct vk_cmdpool *pool; // pool it was allocated from + VkCommandBuffer buf; + VkFence fence; // the fence guards cmd buffer reuse + VkSemaphore done; // the semaphore signals when execution is done + // The semaphores represent dependencies that need to complete before + // this command can be executed. These are *not* owned by the vk_cmd + VkSemaphore deps[MPVK_MAX_CMD_DEPS]; + VkPipelineStageFlags depstages[MPVK_MAX_CMD_DEPS]; + int num_deps; + // Since VkFences are useless, we have to manually track "callbacks" + // to fire once the VkFence completes. These are used for multiple purposes, + // ranging from garbage collection (resource deallocation) to fencing. + struct vk_callback *callbacks; + int num_callbacks; +}; + +// Associate a callback with the completion of the current command. This +// bool will be set to `true` once the command completes, or shortly thereafter. +void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg); + +// Associate a dependency for the current command. This semaphore must signal +// by the corresponding stage before the command may execute. +void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, + VkPipelineStageFlags depstage); + +#define MPVK_MAX_QUEUES 8 +#define MPVK_MAX_CMDS 64 + +// Command pool / queue family hybrid abstraction +struct vk_cmdpool { + VkQueueFamilyProperties props; + uint32_t qf; // queue family index + VkCommandPool pool; + VkQueue queues[MPVK_MAX_QUEUES]; + int qcount; + int qindex; + // Command buffers associated with this queue + struct vk_cmd cmds[MPVK_MAX_CMDS]; + int cindex; + int cindex_pending; +}; + +// Fetch the next command buffer from a command pool and begin recording to it. +// Returns NULL on failure. 
+struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool); + +// Finish the currently recording command buffer and submit it for execution. +// If `done` is not NULL, it will be set to a semaphore that will signal once +// the command completes. (And MUST have a corresponding semaphore wait) +// Returns whether successful. +bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done); + +// Rotate the queues for each vk_cmdpool. Call this once per frame to ensure +// good parallelism between frames when using multiple queues +void vk_cmd_cycle_queues(struct mpvk_ctx *vk); + +// Predefined structs for a simple non-layered, non-mipped image +extern const VkImageSubresourceRange vk_range; +extern const VkImageSubresourceLayers vk_layers; diff --git a/video/out/w32_common.c b/video/out/w32_common.c index b93a4fd..feeae81 100644 --- a/video/out/w32_common.c +++ b/video/out/w32_common.c @@ -62,8 +62,12 @@ typedef enum MONITOR_DPI_TYPE { } MONITOR_DPI_TYPE; #endif +#define rect_w(r) ((r).right - (r).left) +#define rect_h(r) ((r).bottom - (r).top) + struct w32_api { HRESULT (WINAPI *pGetDpiForMonitor)(HMONITOR, MONITOR_DPI_TYPE, UINT*, UINT*); + BOOL (WINAPI *pImmDisableIME)(DWORD); }; struct vo_w32_state { @@ -84,15 +88,8 @@ struct vo_w32_state { HWINEVENTHOOK parent_evt_hook; HMONITOR monitor; // Handle of the current screen - struct mp_rect screenrc; // Size and virtual position of the current screen char *color_profile; // Path of the current screen's color profile - // last non-fullscreen extends (updated only on fullscreen or on initialization) - int prev_width; - int prev_height; - int prev_x; - int prev_y; - // Has the window seen a WM_DESTROY? If so, don't call DestroyWindow again. 
bool destroyed; @@ -102,11 +99,10 @@ struct vo_w32_state { bool current_fs; bool toggle_fs; // whether the current fullscreen state needs to be switched - // currently known window state - int window_x; - int window_y; - int dw; - int dh; + RECT windowrc; // currently known window rect + RECT screenrc; // current screen rect + // last non-fullscreen rect, updated only on fullscreen or on initialization + RECT prev_windowrc; // video size uint32_t o_dwidth; @@ -130,6 +126,9 @@ struct vo_w32_state { // UTF-16 decoding state for WM_CHAR and VK_PACKET int high_surrogate; + // Whether to fit the window on screen on next window state updating + bool fit_on_screen; + ITaskbarList2 *taskbar_list; ITaskbarList3 *taskbar_list3; UINT tbtnCreatedMsg; @@ -140,6 +139,7 @@ struct vo_w32_state { // updates on move/resize/displaychange double display_fps; + bool moving; bool snapped; int snap_dx; int snap_dy; @@ -182,16 +182,16 @@ static LRESULT borderless_nchittest(struct vo_w32_state *w32, int x, int y) if (mouse.y < frame_size) { if (mouse.x < diagonal_width) return HTTOPLEFT; - if (mouse.x >= w32->dw - diagonal_width) + if (mouse.x >= rect_w(w32->windowrc) - diagonal_width) return HTTOPRIGHT; return HTTOP; } // Hit-test bottom border - if (mouse.y >= w32->dh - frame_size) { + if (mouse.y >= rect_h(w32->windowrc) - frame_size) { if (mouse.x < diagonal_width) return HTBOTTOMLEFT; - if (mouse.x >= w32->dw - diagonal_width) + if (mouse.x >= rect_w(w32->windowrc) - diagonal_width) return HTBOTTOMRIGHT; return HTBOTTOM; } @@ -199,7 +199,7 @@ static LRESULT borderless_nchittest(struct vo_w32_state *w32, int x, int y) // Hit-test side borders if (mouse.x < frame_size) return HTLEFT; - if (mouse.x >= w32->dw - frame_size) + if (mouse.x >= rect_w(w32->windowrc) - frame_size) return HTRIGHT; return HTCLIENT; } @@ -607,6 +607,9 @@ static void update_playback_state(struct vo_w32_state *w32) static bool snap_to_screen_edges(struct vo_w32_state *w32, RECT *rc) { + if (w32->parent || 
w32->current_fs || IsMaximized(w32->window)) + return false; + if (!w32->opts->snap_window) { w32->snapped = false; return false; @@ -616,16 +619,24 @@ static bool snap_to_screen_edges(struct vo_w32_state *w32, RECT *rc) POINT cursor; if (!GetWindowRect(w32->window, &rect) || !GetCursorPos(&cursor)) return false; - // Check for aero snapping - if ((rc->right - rc->left != rect.right - rect.left) || - (rc->bottom - rc->top != rect.bottom - rect.top)) + // Check if window is going to be aero-snapped + if (rect_w(*rc) != rect_w(rect) || rect_h(*rc) != rect_h(rect)) + return false; + + // Check if window has already been aero-snapped + WINDOWPLACEMENT wp = {0}; + wp.length = sizeof(wp); + if (!GetWindowPlacement(w32->window, &wp)) + return false; + RECT wr = wp.rcNormalPosition; + if (rect_w(*rc) != rect_w(wr) || rect_h(*rc) != rect_h(wr)) return false; MONITORINFO mi = { .cbSize = sizeof(mi) }; if (!GetMonitorInfoW(w32->monitor, &mi)) return false; // Get the work area to let the window snap to taskbar - RECT wr = mi.rcWork; + wr = mi.rcWork; // Check for invisible borders and adjust the work area size RECT frame = {0}; @@ -706,15 +717,10 @@ static void update_screen_rect(struct vo_w32_state *w32) // Handle --fs-screen=all if (w32->current_fs && screen == -2) { - struct mp_rect rc = { - GetSystemMetrics(SM_XVIRTUALSCREEN), - GetSystemMetrics(SM_YVIRTUALSCREEN), - GetSystemMetrics(SM_CXVIRTUALSCREEN), - GetSystemMetrics(SM_CYVIRTUALSCREEN), - }; - rc.x1 += rc.x0; - rc.y1 += rc.y0; - w32->screenrc = rc; + const int x = GetSystemMetrics(SM_XVIRTUALSCREEN); + const int y = GetSystemMetrics(SM_YVIRTUALSCREEN); + SetRect(&w32->screenrc, x, y, x + GetSystemMetrics(SM_CXVIRTUALSCREEN), + y + GetSystemMetrics(SM_CYVIRTUALSCREEN)); return; } @@ -734,10 +740,7 @@ static void update_screen_rect(struct vo_w32_state *w32) MONITORINFO mi = { .cbSize = sizeof(mi) }; GetMonitorInfoW(mon, &mi); - w32->screenrc = (struct mp_rect){ - mi.rcMonitor.left, mi.rcMonitor.top, - 
mi.rcMonitor.right, mi.rcMonitor.bottom, - }; + w32->screenrc = mi.rcMonitor; } static DWORD update_style(struct vo_w32_state *w32, DWORD style) @@ -754,139 +757,148 @@ static DWORD update_style(struct vo_w32_state *w32, DWORD style) return style; } -// Update the window title, position, size, and border style. -static void reinit_window_state(struct vo_w32_state *w32) +static void update_window_style(struct vo_w32_state *w32) { - HWND layer = HWND_NOTOPMOST; - RECT r; - if (w32->parent) return; - bool new_fs = w32->toggle_fs ? !w32->current_fs : w32->opts->fullscreen; - bool toggle_fs = w32->current_fs != new_fs; - w32->current_fs = new_fs; - w32->toggle_fs = false; + // SetWindowLongPtr can trigger a WM_SIZE event, so window rect + // has to be saved now and restored after setting the new style. + const RECT wr = w32->windowrc; + const DWORD style = GetWindowLongPtrW(w32->window, GWL_STYLE); + SetWindowLongPtrW(w32->window, GWL_STYLE, update_style(w32, style)); + w32->windowrc = wr; +} - if (w32->taskbar_list) { - ITaskbarList2_MarkFullscreenWindow(w32->taskbar_list, - w32->window, w32->current_fs); +// Adjust rc size and position if its size is larger than rc2. +// returns true if the rectangle was modified. 
+static bool fit_rect(RECT *rc, RECT *rc2) +{ + // Calculate old size and maximum new size + int o_w = rect_w(*rc), o_h = rect_h(*rc); + int n_w = rect_w(*rc2), n_h = rect_h(*rc2); + if (o_w <= n_w && o_h <= n_h) + return false; + + // Apply letterboxing + const float o_asp = o_w / (float)MPMAX(o_h, 1); + const float n_asp = n_w / (float)MPMAX(n_h, 1); + if (o_asp > n_asp) { + n_h = n_w / o_asp; + } else { + n_w = n_h * o_asp; } - DWORD style = update_style(w32, GetWindowLongPtrW(w32->window, GWL_STYLE)); + // Calculate new position and save the rect + const int x = rc->left + o_w / 2 - n_w / 2; + const int y = rc->top + o_h / 2 - n_h / 2; + SetRect(rc, x, y, x + n_w, y + n_h); + return true; +} - if (w32->opts->ontop) - layer = HWND_TOPMOST; +// Adjust window size and position if its size is larger than the screen size. +static void fit_window_on_screen(struct vo_w32_state *w32) +{ + if (w32->parent || w32->current_fs || IsMaximized(w32->window)) + return; - // xxx not sure if this can trigger any unwanted messages (WM_MOVE/WM_SIZE) - update_screen_rect(w32); + RECT screen = w32->screenrc; + if (w32->opts->border && w32->opts->fit_border) + subtract_window_borders(w32->window, &screen); - int screen_w = w32->screenrc.x1 - w32->screenrc.x0; - int screen_h = w32->screenrc.y1 - w32->screenrc.y0; + if (fit_rect(&w32->windowrc, &screen)) { + MP_VERBOSE(w32, "adjusted window bounds: %d:%d:%d:%d\n", + (int)w32->windowrc.left, (int)w32->windowrc.top, + (int)rect_w(w32->windowrc), (int)rect_h(w32->windowrc)); + } +} - if (w32->current_fs) { - // Save window position and size when switching to fullscreen. - if (toggle_fs) { - w32->prev_width = w32->dw; - w32->prev_height = w32->dh; - w32->prev_x = w32->window_x; - w32->prev_y = w32->window_y; - MP_VERBOSE(w32, "save window bounds: %d:%d:%d:%d\n", - w32->prev_x, w32->prev_y, w32->prev_width, w32->prev_height); - } +// Calculate new fullscreen state and change window size and position. 
+// returns true if fullscreen state was changed. +static bool update_fullscreen_state(struct vo_w32_state *w32) +{ + if (w32->parent) + return false; - w32->window_x = w32->screenrc.x0; - w32->window_y = w32->screenrc.y0; - w32->dw = screen_w; - w32->dh = screen_h; - } else { - if (toggle_fs) { - // Restore window position and size when switching from fullscreen. - MP_VERBOSE(w32, "restore window bounds: %d:%d:%d:%d\n", - w32->prev_x, w32->prev_y, w32->prev_width, w32->prev_height); - w32->dw = w32->prev_width; - w32->dh = w32->prev_height; - w32->window_x = w32->prev_x; - w32->window_y = w32->prev_y; - } + bool new_fs = w32->opts->fullscreen; + if (w32->toggle_fs) { + new_fs = !w32->current_fs; + w32->toggle_fs = false; } - r.left = w32->window_x; - r.right = r.left + w32->dw; - r.top = w32->window_y; - r.bottom = r.top + w32->dh; - - SetWindowLongPtrW(w32->window, GWL_STYLE, style); - - RECT cr = r; - add_window_borders(w32->window, &r); - // Check on client area size instead of window size on --fit-border=no - long o_w; - long o_h; - if( w32->opts->fit_border ) { - o_w = r.right - r.left; - o_h = r.bottom - r.top; - } else { - o_w = cr.right - cr.left; - o_h = cr.bottom - cr.top; - } + bool toggle_fs = w32->current_fs != new_fs; + w32->current_fs = new_fs; - if ( !w32->current_fs && ( o_w > screen_w || o_h > screen_h ) ) - { - MP_VERBOSE(w32, "requested window size larger than the screen\n"); - // Use the aspect of the client area, not the full window size. - // Basically, try to compute the maximum window size. 
- long n_w; - long n_h; - if( w32->opts->fit_border ) { - n_w = screen_w - (r.right - cr.right) - (cr.left - r.left); - n_h = screen_h - (r.bottom - cr.bottom) - (cr.top - r.top); - } else { - n_w = screen_w; - n_h = screen_h; - } - // Letterbox - double asp = (cr.right - cr.left) / (double)(cr.bottom - cr.top); - double s_asp = n_w / (double)n_h; - if (asp > s_asp) { - n_h = n_w / asp; + update_screen_rect(w32); + + if (toggle_fs) { + RECT rc; + char msg[50]; + if (w32->current_fs) { + // Save window rect when switching to fullscreen. + rc = w32->prev_windowrc = w32->windowrc; + sprintf(msg, "save window bounds"); } else { - n_w = n_h * asp; + // Restore window rect when switching from fullscreen. + rc = w32->windowrc = w32->prev_windowrc; + sprintf(msg, "restore window bounds"); } - // Save new size - w32->dw = n_w; - w32->dh = n_h; - // Get old window center - long o_cx = r.left + (r.right - r.left) / 2; - long o_cy = r.top + (r.bottom - r.top) / 2; - // Add window borders to the new window size - r = (RECT){.right = n_w, .bottom = n_h}; - add_window_borders(w32->window, &r); - // Get top and left border size for client area position calculation - long b_top = -r.top; - long b_left = -r.left; - // Center the final window around the old window center - n_w = r.right - r.left; - n_h = r.bottom - r.top; - r.left = o_cx - n_w / 2; - r.top = o_cy - n_h / 2; - r.right = r.left + n_w; - r.bottom = r.top + n_h; - // Save new client area position - w32->window_x = r.left + b_left; - w32->window_y = r.top + b_top; + MP_VERBOSE(w32, "%s: %d:%d:%d:%d\n", msg, + (int)rc.left, (int)rc.top, (int)rect_w(rc), (int)rect_h(rc)); } + if (w32->current_fs) + w32->windowrc = w32->screenrc; + MP_VERBOSE(w32, "reset window bounds: %d:%d:%d:%d\n", - (int) r.left, (int) r.top, (int)(r.right - r.left), - (int)(r.bottom - r.top)); + (int)w32->windowrc.left, (int)w32->windowrc.top, + (int)rect_w(w32->windowrc), (int)rect_h(w32->windowrc)); + return toggle_fs; +} + +static void 
update_window_state(struct vo_w32_state *w32) +{ + if (w32->parent) + return; + + RECT wr = w32->windowrc; + add_window_borders(w32->window, &wr); - SetWindowPos(w32->window, layer, r.left, r.top, r.right - r.left, - r.bottom - r.top, SWP_FRAMECHANGED | SWP_SHOWWINDOW); + SetWindowPos(w32->window, w32->opts->ontop ? HWND_TOPMOST : HWND_NOTOPMOST, + wr.left, wr.top, rect_w(wr), rect_h(wr), + SWP_FRAMECHANGED | SWP_SHOWWINDOW); + + // Notify the taskbar about the fullscreen state only after the window + // is visible, to make sure the taskbar item has already been created + if (w32->taskbar_list) { + ITaskbarList2_MarkFullscreenWindow(w32->taskbar_list, + w32->window, w32->current_fs); + } signal_events(w32, VO_EVENT_RESIZE); } +static void reinit_window_state(struct vo_w32_state *w32) +{ + if (w32->parent) + return; + + // The order matters: fs state should be updated prior to changing styles + bool toggle_fs = update_fullscreen_state(w32); + update_window_style(w32); + + // Assume that the window has already been fit on screen before switching fs + if (!toggle_fs || w32->fit_on_screen) { + fit_window_on_screen(w32); + // The fullscreen state might still be active, so set the flag + // to fit on screen next time the window leaves the fullscreen. 
+ w32->fit_on_screen = w32->current_fs; + } + + // Show and activate the window after all window state parameters were set + update_window_state(w32); +} + static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam) { @@ -917,25 +929,26 @@ static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, signal_events(w32, VO_EVENT_EXPOSE); break; case WM_MOVE: { - POINT p = {0}; - ClientToScreen(w32->window, &p); - w32->window_x = p.x; - w32->window_y = p.y; + const int x = GET_X_LPARAM(lParam), y = GET_Y_LPARAM(lParam); + OffsetRect(&w32->windowrc, x - w32->windowrc.left, + y - w32->windowrc.top); // Window may intersect with new monitors (see VOCTRL_GET_DISPLAY_NAMES) signal_events(w32, VO_EVENT_WIN_STATE); update_display_info(w32); // if we moved between monitors - MP_DBG(w32, "move window: %d:%d\n", w32->window_x, w32->window_y); + MP_DBG(w32, "move window: %d:%d\n", x, y); break; } case WM_MOVING: { + w32->moving = true; RECT *rc = (RECT*)lParam; if (snap_to_screen_edges(w32, rc)) return TRUE; break; } case WM_ENTERSIZEMOVE: + w32->moving = true; if (w32->snapped) { // Save the cursor offset from the window borders, // so the player window can be unsnapped later @@ -947,13 +960,19 @@ static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, } } break; + case WM_EXITSIZEMOVE: + w32->moving = false; + break; case WM_SIZE: { - RECT r; - if (GetClientRect(w32->window, &r) && r.right > 0 && r.bottom > 0) { - w32->dw = r.right; - w32->dh = r.bottom; + if (w32->moving) + w32->snapped = false; + + const int w = LOWORD(lParam), h = HIWORD(lParam); + if (w > 0 && h > 0) { + w32->windowrc.right = w32->windowrc.left + w; + w32->windowrc.bottom = w32->windowrc.top + h; signal_events(w32, VO_EVENT_RESIZE); - MP_VERBOSE(w32, "resize window: %d:%d\n", w32->dw, w32->dh); + MP_VERBOSE(w32, "resize window: %d:%d\n", w, h); } // Window may have been minimized or restored @@ -971,7 +990,7 @@ static LRESULT CALLBACK WndProc(HWND hWnd, 
UINT message, WPARAM wParam, // (subtracting the window borders) RECT r = *rc; subtract_window_borders(w32->window, &r); - int c_w = r.right - r.left, c_h = r.bottom - r.top; + int c_w = rect_w(r), c_h = rect_h(r); float aspect = w32->o_dwidth / (float) MPMAX(w32->o_dheight, 1); int d_w = c_h * aspect - c_w; int d_h = c_w / aspect - c_h; @@ -988,8 +1007,7 @@ static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, update_display_info(w32); break; case WM_CLOSE: - // Don't actually allow it to destroy the window, or whatever else it - // is that will make us lose WM_USER wakeups. + // Don't destroy the window yet to not lose wakeup events. mp_input_put_key(w32->input_ctx, MP_KEY_CLOSE_WIN); return 0; case WM_NCDESTROY: // Sometimes only WM_NCDESTROY is received in --wid mode @@ -1260,56 +1278,51 @@ static void run_message_loop(struct vo_w32_state *w32) static void gui_thread_reconfig(void *ptr) { struct vo_w32_state *w32 = ptr; - struct vo *vo = w32->vo; struct vo_win_geometry geo; - vo_calc_window_geometry(vo, &w32->screenrc, &geo); + struct mp_rect screen = { w32->screenrc.left, w32->screenrc.top, + w32->screenrc.right, w32->screenrc.bottom }; + vo_calc_window_geometry(vo, &screen, &geo); vo_apply_window_geometry(vo, &geo); - bool reset_size = w32->o_dwidth != vo->dwidth || w32->o_dheight != vo->dheight; - bool pos_init = false; + bool reset_size = w32->o_dwidth != vo->dwidth || + w32->o_dheight != vo->dheight; w32->o_dwidth = vo->dwidth; w32->o_dheight = vo->dheight; - // the desired size is ignored in wid mode, it always matches the window size. 
- if (!w32->parent) { - if (w32->window_bounds_initialized) { - // restore vo_dwidth/vo_dheight, which are reset against our will - // in vo_config() - RECT r; - GetClientRect(w32->window, &r); - vo->dwidth = r.right; - vo->dheight = r.bottom; - } else { - w32->window_bounds_initialized = true; - reset_size = true; - pos_init = true; - w32->window_x = w32->prev_x = geo.win.x0; - w32->window_y = w32->prev_y = geo.win.y0; - } + if (!w32->parent && !w32->window_bounds_initialized) { + SetRect(&w32->windowrc, geo.win.x0, geo.win.y0, + geo.win.x0 + vo->dwidth, geo.win.y0 + vo->dheight); + w32->prev_windowrc = w32->windowrc; + w32->window_bounds_initialized = true; + w32->fit_on_screen = true; + goto finish; + } - if (reset_size) { - w32->prev_width = vo->dwidth = w32->o_dwidth; - w32->prev_height = vo->dheight = w32->o_dheight; - } - } else { + // The rect which size is going to be modified. + RECT *rc = &w32->windowrc; + + // The desired size always matches the window size in wid mode. + if (!reset_size || w32->parent) { RECT r; GetClientRect(w32->window, &r); + // Restore vo_dwidth and vo_dheight, which were reset in vo_config() vo->dwidth = r.right; vo->dheight = r.bottom; + } else { + if (w32->current_fs) + rc = &w32->prev_windowrc; + w32->fit_on_screen = true; } - // Recenter window around old position on new video size - // excluding the case when initial position handled by win_state. - if (!pos_init) { - w32->window_x += w32->dw / 2 - vo->dwidth / 2; - w32->window_y += w32->dh / 2 - vo->dheight / 2; - } - w32->dw = vo->dwidth; - w32->dh = vo->dheight; + // Save new window size and position. 
+ const int x = rc->left + rect_w(*rc) / 2 - vo->dwidth / 2; + const int y = rc->top + rect_h(*rc) / 2 - vo->dheight / 2; + SetRect(rc, x, y, x + vo->dwidth, y + vo->dheight); +finish: reinit_window_state(w32); } @@ -1320,25 +1333,18 @@ void vo_w32_config(struct vo *vo) mp_dispatch_run(w32->dispatch, gui_thread_reconfig, w32); } -static void thread_disable_ime(void) -{ - // Disables the IME for windows on this thread. imm32.dll must be loaded - // dynamically to account for machines without East Asian language support. - HMODULE imm32 = LoadLibraryW(L"imm32.dll"); - if (!imm32) - return; - BOOL (WINAPI *pImmDisableIME)(DWORD) = (BOOL (WINAPI*)(DWORD)) - GetProcAddress(imm32, "ImmDisableIME"); - if (pImmDisableIME) - pImmDisableIME(0); - FreeLibrary(imm32); -} - static void w32_api_load(struct vo_w32_state *w32) { HMODULE shcore_dll = LoadLibraryW(L"shcore.dll"); + // Available since Win8.1 w32->api.pGetDpiForMonitor = !shcore_dll ? NULL : (void *)GetProcAddress(shcore_dll, "GetDpiForMonitor"); + + // imm32.dll must be loaded dynamically + // to account for machines without East Asian language support + HMODULE imm32_dll = LoadLibraryW(L"imm32.dll"); + w32->api.pImmDisableIME = !imm32_dll ? 
NULL : + (void *)GetProcAddress(imm32_dll, "ImmDisableIME"); } static void *gui_thread(void *ptr) @@ -1350,7 +1356,10 @@ static void *gui_thread(void *ptr) mpthread_set_name("win32 window"); w32_api_load(w32); - thread_disable_ime(); + + // Disables the IME for windows on this thread + if (w32->api.pImmDisableIME) + w32->api.pImmDisableIME(0); if (w32->opts->WinID >= 0) w32->parent = (HWND)(intptr_t)(w32->opts->WinID); @@ -1423,6 +1432,9 @@ static void *gui_thread(void *ptr) EnableWindow(w32->window, 0); w32->cursor_visible = true; + w32->moving = false; + w32->snapped = false; + w32->snap_dx = w32->snap_dy = 0; update_screen_rect(w32); @@ -1544,10 +1556,11 @@ static int gui_thread_control(struct vo_w32_state *w32, int request, void *arg) reinit_window_state(w32); return VO_TRUE; case VOCTRL_ONTOP: - reinit_window_state(w32); + update_window_state(w32); return VO_TRUE; case VOCTRL_BORDER: - reinit_window_state(w32); + update_window_style(w32); + update_window_state(w32); return VO_TRUE; case VOCTRL_GET_FULLSCREEN: *(bool *)arg = w32->current_fs; @@ -1558,8 +1571,9 @@ static int gui_thread_control(struct vo_w32_state *w32, int request, void *arg) if (!w32->window_bounds_initialized) return VO_FALSE; - s[0] = w32->current_fs ? w32->prev_width : w32->dw; - s[1] = w32->current_fs ? w32->prev_height : w32->dh; + RECT *rc = w32->current_fs ? &w32->prev_windowrc : &w32->windowrc; + s[0] = rect_w(*rc); + s[1] = rect_h(*rc); return VO_TRUE; } case VOCTRL_SET_UNFS_WINDOW_SIZE: { @@ -1567,18 +1581,13 @@ static int gui_thread_control(struct vo_w32_state *w32, int request, void *arg) if (!w32->window_bounds_initialized) return VO_FALSE; - if (w32->current_fs) { - w32->prev_x += w32->prev_width / 2 - s[0] / 2; - w32->prev_y += w32->prev_height / 2 - s[1] / 2; - w32->prev_width = s[0]; - w32->prev_height = s[1]; - } else { - w32->window_x += w32->dw / 2 - s[0] / 2; - w32->window_y += w32->dh / 2 - s[1] / 2; - w32->dw = s[0]; - w32->dh = s[1]; - } + RECT *rc = w32->current_fs ? 
&w32->prev_windowrc : &w32->windowrc; + const int x = rc->left + rect_w(*rc) / 2 - s[0] / 2; + const int y = rc->top + rect_h(*rc) / 2 - s[1] / 2; + SetRect(rc, x, y, x + s[0], y + s[1]); + + w32->fit_on_screen = true; reinit_window_state(w32); return VO_TRUE; } @@ -1648,8 +1657,8 @@ static void do_control(void *ptr) *events |= atomic_fetch_and(&w32->event_flags, 0); // Safe access, since caller (owner of vo) is blocked. if (*events & VO_EVENT_RESIZE) { - w32->vo->dwidth = w32->dw; - w32->vo->dheight = w32->dh; + w32->vo->dwidth = rect_w(w32->windowrc); + w32->vo->dheight = rect_h(w32->windowrc); } } @@ -1660,8 +1669,8 @@ int vo_w32_control(struct vo *vo, int *events, int request, void *arg) *events |= atomic_fetch_and(&w32->event_flags, 0); if (*events & VO_EVENT_RESIZE) { mp_dispatch_lock(w32->dispatch); - vo->dwidth = w32->dw; - vo->dheight = w32->dh; + vo->dwidth = rect_w(w32->windowrc); + vo->dheight = rect_h(w32->windowrc); mp_dispatch_unlock(w32->dispatch); } return VO_TRUE; diff --git a/video/out/wayland/buffer.c b/video/out/wayland/buffer.c deleted file mode 100644 index dce3ca4..0000000 --- a/video/out/wayland/buffer.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * This file is part of mpv video player. - * Copyright © 2014 Alexander Preisinger <alexander.preisinger@gmail.com> - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include "buffer.h" -#include "memfile.h" - -#include <unistd.h> -#include <sys/mman.h> - -int8_t format_get_bytes(const format_t *fmt) -{ - return mp_imgfmt_get_desc(fmt->mp_format).bytes[0]; -} - -shm_buffer_t* shm_buffer_create(uint32_t width, - uint32_t height, - format_t fmt, - struct wl_shm *shm, - const struct wl_buffer_listener *listener) -{ - int8_t bytes = format_get_bytes(&fmt); - uint32_t stride = SHM_BUFFER_STRIDE(width, bytes); - uint32_t size = stride * height; - - shm_buffer_t *buffer = calloc(1, sizeof(shm_buffer_t)); - int fd = memfile_create(size); - - if (fd < 0) { - free(buffer); - return NULL; - } - - buffer->data = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); - - if (buffer->data == MAP_FAILED) { - close(fd); - free(buffer); - return NULL; - } - - buffer->shm_pool = wl_shm_create_pool(shm, fd, size); - buffer->buffer = wl_shm_pool_create_buffer(buffer->shm_pool, - 0, width, height, stride, - fmt.wl_format); - - wl_buffer_add_listener(buffer->buffer, listener, buffer); - - buffer->fd = fd; - buffer->height = height; - buffer->stride = stride; - buffer->format = fmt; - buffer->bytes = bytes; - buffer->pool_size = size; - buffer->pending_height = 0; - buffer->pending_width = 0; - - return buffer; -} - -int shm_buffer_resize(shm_buffer_t *buffer, uint32_t width, uint32_t height) -{ - uint32_t new_stride = SHM_BUFFER_STRIDE(width, buffer->bytes); - uint32_t new_size = new_stride * height; - - if (SHM_BUFFER_IS_BUSY(buffer)) { - SHM_BUFFER_SET_PNDNG_RSZ(buffer); - buffer->pending_width = width; - buffer->pending_height = height; - return SHM_BUFFER_BUSY; - } - - SHM_BUFFER_CLEAR_PNDNG_RSZ(buffer); - - if (new_size > buffer->pool_size) { - munmap(buffer->data, buffer->pool_size); - ftruncate(buffer->fd, new_size); - - buffer->data = mmap(NULL, new_size, PROT_READ | PROT_WRITE, - MAP_SHARED, buffer->fd, 0); - - // TODO: the buffer should be destroyed when -1 is return - if (buffer->data == MAP_FAILED) - return -1; - - 
wl_shm_pool_resize(buffer->shm_pool, new_size); - buffer->pool_size = new_size; - } - - const void *listener = wl_proxy_get_listener((struct wl_proxy*)buffer->buffer); - - wl_buffer_destroy(buffer->buffer); - buffer->buffer = wl_shm_pool_create_buffer(buffer->shm_pool, - 0, width, height, new_stride, - buffer->format.wl_format); - - wl_buffer_add_listener(buffer->buffer, listener, buffer); - - buffer->height = height; - buffer->stride = new_stride; - - return 0; -} - -int shm_buffer_pending_resize(shm_buffer_t *buffer) -{ - if (SHM_BUFFER_PENDING_RESIZE(buffer)) { - SHM_BUFFER_CLEAR_PNDNG_RSZ(buffer); - return shm_buffer_resize(buffer, buffer->pending_width, buffer->pending_height); - } - else { - return 0; - } -} - -void shm_buffer_destroy(shm_buffer_t *buffer) -{ - if (!buffer) - return; - - wl_buffer_destroy(buffer->buffer); - wl_shm_pool_destroy(buffer->shm_pool); - munmap(buffer->data, buffer->pool_size); - close(buffer->fd); - free(buffer); -} diff --git a/video/out/wayland/buffer.h b/video/out/wayland/buffer.h deleted file mode 100644 index 783cd10..0000000 --- a/video/out/wayland/buffer.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * This file is part of mpv video player. - * Copyright © 2014 Alexander Preisinger <alexander.preisinger@gmail.com> - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#ifndef MPLAYER_WAYLAND_BUFFER_H -#define MPLAYER_WAYLAND_BUFFER_H - -#include <libavutil/common.h> -#include "video/sws_utils.h" -#include "video/img_format.h" -#include "video/out/wayland_common.h" - -#define SHM_BUFFER_STRIDE(width, bytes) \ - FFALIGN((width) * (bytes), SWS_MIN_BYTE_ALIGN) - -typedef struct format { - enum wl_shm_format wl_format; - enum mp_imgfmt mp_format; -} format_t; - -int8_t format_get_bytes(const format_t *fmt); - -typedef enum shm_buffer_flags { - SHM_BUFFER_BUSY = 1 << 0, // in use by the compositor - SHM_BUFFER_DIRTY = 1 << 1, // buffer contains new content - SHM_BUFFER_ONESHOT = 1 << 2, // free after release - SHM_BUFFER_RESIZE_LATER = 1 << 3, // free after release -} shm_buffer_flags_t; - -#define SHM_BUFFER_IS_BUSY(b) (!!((b)->flags & SHM_BUFFER_BUSY)) -#define SHM_BUFFER_IS_DIRTY(b) (!!((b)->flags & SHM_BUFFER_DIRTY)) -#define SHM_BUFFER_IS_ONESHOT(b) (!!((b)->flags & SHM_BUFFER_ONESHOT)) -#define SHM_BUFFER_PENDING_RESIZE(b) (!!((b)->flags & SHM_BUFFER_RESIZE_LATER)) - -#define SHM_BUFFER_SET_BUSY(b) (b)->flags |= SHM_BUFFER_BUSY -#define SHM_BUFFER_SET_DIRTY(b) (b)->flags |= SHM_BUFFER_DIRTY -#define SHM_BUFFER_SET_ONESHOT(b) (b)->flags |= SHM_BUFFER_ONESHOT -#define SHM_BUFFER_SET_PNDNG_RSZ(b) (b)->flags |= SHM_BUFFER_RESIZE_LATER - -#define SHM_BUFFER_CLEAR_BUSY(b) (b)->flags &= ~SHM_BUFFER_BUSY -#define SHM_BUFFER_CLEAR_DIRTY(b) (b)->flags &= ~SHM_BUFFER_DIRTY -#define SHM_BUFFER_CLEAR_ONESHOT(b) (b)->flags &= ~SHM_BUFFER_ONESHOT -#define SHM_BUFFER_CLEAR_PNDNG_RSZ(b) (b)->flags &= ~SHM_BUFFER_RESIZE_LATER - -typedef struct buffer { - struct wl_buffer *buffer; - - int flags; - - uint32_t height; - uint32_t stride; - uint32_t bytes; // bytes per pixel - // width = stride / bytes per pixel - // size = stride * height - - struct wl_shm_pool *shm_pool; // for growing buffers; - - int fd; - void *data; - uint32_t pool_size; // size of pool and data XXX - // pool_size can be far bigger than the buffer size - - format_t 
format; - - uint32_t pending_height; - uint32_t pending_width; -} shm_buffer_t; - -shm_buffer_t* shm_buffer_create(uint32_t width, - uint32_t height, - format_t fmt, - struct wl_shm *shm, - const struct wl_buffer_listener *listener); - -// shm pool is only able to grow and won't shrink -// returns 0 on success or buffer flags indicating the buffer status which -// prevent it from resizing -int shm_buffer_resize(shm_buffer_t *buffer, uint32_t width, uint32_t height); - -// if shm_buffer_resize returns SHM_BUFFER_BUSY this function can be called -// after the buffer is released to resize it afterwards -// returns 0 if no pending resize flag was set and -1 on errors -int shm_buffer_pending_resize(shm_buffer_t *buffer); - -// buffer is freed, don't use the buffer after calling this function on it -void shm_buffer_destroy(shm_buffer_t *buffer); - -#endif /* MPLAYER_WAYLAND_BUFFER_H */ diff --git a/video/out/wayland/memfile.c b/video/out/wayland/memfile.c deleted file mode 100644 index f28216d..0000000 --- a/video/out/wayland/memfile.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * This file is part of mpv video player. - * Copyright © 2014 Alexander Preisinger <alexander.preisinger@gmail.com> - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <fcntl.h> -#include <unistd.h> -#include <stdlib.h> -#include <errno.h> -#include <string.h> - -#include "video/out/wayland/memfile.h" - -/* copied from weston clients */ -static int set_cloexec_or_close(int fd) -{ - long flags; - - if (fd == -1) - return -1; - - if ((flags = fcntl(fd, F_GETFD)) == -1) - goto err; - - if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1) - goto err; - - return fd; - -err: - close(fd); - return -1; -} - -static int create_tmpfile_cloexec(char *tmpname) -{ - int fd; - -#ifdef HAVE_MKOSTEMP - fd = mkostemp(tmpname, O_CLOEXEC); - if (fd >= 0) - unlink(tmpname); -#else - fd = mkstemp(tmpname); - if (fd >= 0) { - fd = set_cloexec_or_close(fd); - unlink(tmpname); - } -#endif - - return fd; -} - -static int os_create_anonymous_file(off_t size) -{ - static const char template[] = "/mpv-temp-XXXXXX"; - const char *path; - char *name; - int fd; - - path = getenv("XDG_RUNTIME_DIR"); - if (!path) { - errno = ENOENT; - return -1; - } - - name = malloc(strlen(path) + sizeof(template)); - if (!name) - return -1; - - strcpy(name, path); - strcat(name, template); - - fd = create_tmpfile_cloexec(name); - - free(name); - - if (fd < 0) - return -1; - - if (ftruncate(fd, size) < 0) { - close(fd); - return -1; - } - - return fd; -} - -int memfile_create(off_t size) -{ - return os_create_anonymous_file(size); -} diff --git a/video/out/wayland/memfile.h b/video/out/wayland/memfile.h deleted file mode 100644 index 67cdb1b..0000000 --- a/video/out/wayland/memfile.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * This file is part of mpv video player. - * Copyright © 2014 Alexander Preisinger <alexander.preisinger@gmail.com> - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. - */ - -#ifndef MPLAYER_WAYLAND_MEMFILE_H -#define MPLAYER_WAYLAND_MEMFILE_H - -// create file decsriptor to memory space without filesystem representation -// truncates to size immediately -int memfile_create(off_t size); - -#endif /* MPLAYER_WAYLAND_MEMFILE_H */ diff --git a/video/out/wayland/server-decoration.xml b/video/out/wayland/server-decoration.xml new file mode 100644 index 0000000..8bc106c --- /dev/null +++ b/video/out/wayland/server-decoration.xml @@ -0,0 +1,94 @@ +<?xml version="1.0" encoding="UTF-8"?> +<protocol name="server_decoration"> + <copyright><![CDATA[ + Copyright (C) 2015 Martin Gräßlin + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 2.1 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ ]]></copyright> + <interface name="org_kde_kwin_server_decoration_manager" version="1"> + <description summary="Server side window decoration manager"> + This interface allows to coordinate whether the server should create + a server-side window decoration around a wl_surface representing a + shell surface (wl_shell_surface or similar). By announcing support + for this interface the server indicates that it supports server + side decorations. + </description> + <request name="create"> + <description summary="Create a server-side decoration object for a given surface"> + When a client creates a server-side decoration object it indicates + that it supports the protocol. The client is supposed to tell the + server whether it wants server-side decorations or will provide + client-side decorations. + + If the client does not create a server-side decoration object for + a surface the server interprets this as lack of support for this + protocol and considers it as client-side decorated. Nevertheless a + client-side decorated surface should use this protocol to indicate + to the server that it does not want a server-side deco. + </description> + <arg name="id" type="new_id" interface="org_kde_kwin_server_decoration"/> + <arg name="surface" type="object" interface="wl_surface"/> + </request> + <enum name="mode"> + <description summary="Possible values to use in request_mode and the event mode."/> + <entry name="None" value="0" summary="Undecorated: The surface is not decorated at all, neither server nor client-side. 
An example is a popup surface which should not be decorated."/> + <entry name="Client" value="1" summary="Client-side decoration: The decoration is part of the surface and the client."/> + <entry name="Server" value="2" summary="Server-side decoration: The server embeds the surface into a decoration frame."/> + </enum> + <event name="default_mode"> + <description summary="The default mode used on the server"> + This event is emitted directly after binding the interface. It contains + the default mode for the decoration. When a new server decoration object + is created this new object will be in the default mode until the first + request_mode is requested. + + The server may change the default mode at any time. + </description> + <arg name="mode" type="uint" summary="The default decoration mode applied to newly created server decorations."/> + </event> + </interface> + <interface name="org_kde_kwin_server_decoration" version="1"> + <request name="release" type="destructor"> + <description summary="release the server decoration object"/> + </request> + <enum name="mode"> + <description summary="Possible values to use in request_mode and the event mode."/> + <entry name="None" value="0" summary="Undecorated: The surface is not decorated at all, neither server nor client-side. 
An example is a popup surface which should not be decorated."/> + <entry name="Client" value="1" summary="Client-side decoration: The decoration is part of the surface and the client."/> + <entry name="Server" value="2" summary="Server-side decoration: The server embeds the surface into a decoration frame."/> + </enum> + <request name="request_mode"> + <description summary="The decoration mode the surface wants to use."/> + <arg name="mode" type="uint" summary="The mode this surface wants to use."/> + </request> + <event name="mode"> + <description summary="The new decoration mode applied by the server"> + This event is emitted directly after the decoration is created and + represents the base decoration policy by the server. E.g. a server + which wants all surfaces to be client-side decorated will send Client, + a server which wants server-side decoration will send Server. + + The client can request a different mode through the decoration request. + The server will acknowledge this by another event with the same mode. So + even if a server prefers server-side decoration it's possible to force a + client-side decoration. + + The server may emit this event at any time. In this case the client can + again request a different mode. It's the responsibility of the server to + prevent a feedback loop. + </description> + <arg name="mode" type="uint" summary="The decoration mode applied to the surface by the server."/> + </event> + </interface> +</protocol> diff --git a/video/out/wayland_common.c b/video/out/wayland_common.c index 181723a..19adf01 100644 --- a/video/out/wayland_common.c +++ b/video/out/wayland_common.c @@ -1,8 +1,5 @@ /* * This file is part of mpv video player. - * Copyright © 2008 Kristian Høgsberg - * Copyright © 2012-2013 Collabora, Ltd. 
- * Copyright © 2013 Alexander Preisinger <alexander.preisinger@gmail.com> * * mpv is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -18,243 +15,331 @@ * License along with mpv. If not, see <http://www.gnu.org/licenses/>. */ -#include <stdio.h> -#include <stdlib.h> -#include <math.h> -#include <inttypes.h> -#include <limits.h> -#include <assert.h> #include <poll.h> #include <unistd.h> - -#include <sys/mman.h> #include <linux/input.h> - -#include "config.h" -#include "misc/bstr.h" -#include "options/options.h" #include "common/msg.h" -#include "mpv_talloc.h" - -#include "wayland_common.h" - -#include "vo.h" -#include "win_state.h" +#include "input/input.h" +#include "input/keycodes.h" #include "osdep/io.h" #include "osdep/timer.h" +#include "win_state.h" +#include "wayland_common.h" -#include "input/input.h" -#include "input/event.h" -#include "input/keycodes.h" +// Generated from xdg-shell-unstable-v6.xml +#include "video/out/wayland/xdg-shell-v6.h" -static int lookupkey(int key); +// Generated from idle-inhibit-unstable-v1.xml +#include "video/out/wayland/idle-inhibit-v1.h" -static void hide_cursor(struct vo_wayland_state * wl); -static void show_cursor(struct vo_wayland_state * wl); -static void window_move(struct vo_wayland_state * wl, uint32_t serial); -static void window_set_title(struct vo_wayland_state * wl, const char *title); -static void schedule_resize(struct vo_wayland_state *wl, - uint32_t edges, - int32_t width, - int32_t height); +// Generated from server-decoration.xml +#include "video/out/wayland/srv-decor.h" -static void vo_wayland_fullscreen(struct vo *vo); +static void xdg_shell_ping(void *data, struct zxdg_shell_v6 *shell, uint32_t serial) +{ + zxdg_shell_v6_pong(shell, serial); +} -static const struct wl_callback_listener frame_listener; +static const struct zxdg_shell_v6_listener xdg_shell_listener = { + xdg_shell_ping, +}; -static const struct mp_keymap keymap[] = { - // special 
keys - {XKB_KEY_Pause, MP_KEY_PAUSE}, {XKB_KEY_Escape, MP_KEY_ESC}, - {XKB_KEY_BackSpace, MP_KEY_BS}, {XKB_KEY_Tab, MP_KEY_TAB}, - {XKB_KEY_Return, MP_KEY_ENTER}, {XKB_KEY_Menu, MP_KEY_MENU}, - {XKB_KEY_Print, MP_KEY_PRINT}, +static int spawn_cursor(struct vo_wayland_state *wl) +{ + if (wl->allocated_cursor_scale == wl->scaling) /* Reuse if size is identical */ + return 0; + else if (wl->cursor_theme) + wl_cursor_theme_destroy(wl->cursor_theme); + + wl->cursor_theme = wl_cursor_theme_load(NULL, 32*wl->scaling, wl->shm); + if (!wl->cursor_theme) { + MP_ERR(wl, "Unable to load cursor theme!\n"); + return 1; + } - // cursor keys - {XKB_KEY_Left, MP_KEY_LEFT}, {XKB_KEY_Right, MP_KEY_RIGHT}, - {XKB_KEY_Up, MP_KEY_UP}, {XKB_KEY_Down, MP_KEY_DOWN}, + wl->default_cursor = wl_cursor_theme_get_cursor(wl->cursor_theme, "left_ptr"); + if (!wl->default_cursor) { + MP_ERR(wl, "Unable to load cursor theme!\n"); + return 1; + } - // navigation block - {XKB_KEY_Insert, MP_KEY_INSERT}, {XKB_KEY_Delete, MP_KEY_DELETE}, - {XKB_KEY_Home, MP_KEY_HOME}, {XKB_KEY_End, MP_KEY_END}, - {XKB_KEY_Page_Up, MP_KEY_PAGE_UP}, {XKB_KEY_Page_Down, MP_KEY_PAGE_DOWN}, + wl->allocated_cursor_scale = wl->scaling; - // F-keys - {XKB_KEY_F1, MP_KEY_F+1}, {XKB_KEY_F2, MP_KEY_F+2}, - {XKB_KEY_F3, MP_KEY_F+3}, {XKB_KEY_F4, MP_KEY_F+4}, - {XKB_KEY_F5, MP_KEY_F+5}, {XKB_KEY_F6, MP_KEY_F+6}, - {XKB_KEY_F7, MP_KEY_F+7}, {XKB_KEY_F8, MP_KEY_F+8}, - {XKB_KEY_F9, MP_KEY_F+9}, {XKB_KEY_F10, MP_KEY_F+10}, - {XKB_KEY_F11, MP_KEY_F+11}, {XKB_KEY_F12, MP_KEY_F+12}, + return 0; +} - // numpad independent of numlock - {XKB_KEY_KP_Subtract, '-'}, {XKB_KEY_KP_Add, '+'}, - {XKB_KEY_KP_Multiply, '*'}, {XKB_KEY_KP_Divide, '/'}, - {XKB_KEY_KP_Enter, MP_KEY_KPENTER}, +static int set_cursor_visibility(struct vo_wayland_state *wl, bool on) +{ + if (!wl->pointer) + return VO_NOTAVAIL; + if (on) { + if (spawn_cursor(wl)) + return VO_FALSE; + struct wl_cursor_image *img = wl->default_cursor->images[0]; + struct wl_buffer *buffer = 
wl_cursor_image_get_buffer(img); + if (!buffer) + return VO_FALSE; + wl_pointer_set_cursor(wl->pointer, wl->pointer_id, wl->cursor_surface, + img->hotspot_x/wl->scaling, img->hotspot_y/wl->scaling); + wl_surface_set_buffer_scale(wl->cursor_surface, wl->scaling); + wl_surface_attach(wl->cursor_surface, buffer, 0, 0); + wl_surface_damage(wl->cursor_surface, 0, 0, img->width, img->height); + wl_surface_commit(wl->cursor_surface); + } else { + wl_pointer_set_cursor(wl->pointer, wl->pointer_id, NULL, 0, 0); + } + return VO_TRUE; +} - // numpad with numlock - {XKB_KEY_KP_0, MP_KEY_KP0}, {XKB_KEY_KP_1, MP_KEY_KP1}, - {XKB_KEY_KP_2, MP_KEY_KP2}, {XKB_KEY_KP_3, MP_KEY_KP3}, - {XKB_KEY_KP_4, MP_KEY_KP4}, {XKB_KEY_KP_5, MP_KEY_KP5}, - {XKB_KEY_KP_6, MP_KEY_KP6}, {XKB_KEY_KP_7, MP_KEY_KP7}, - {XKB_KEY_KP_8, MP_KEY_KP8}, {XKB_KEY_KP_9, MP_KEY_KP9}, - {XKB_KEY_KP_Decimal, MP_KEY_KPDEC}, {XKB_KEY_KP_Separator, MP_KEY_KPDEC}, +static void pointer_handle_enter(void *data, struct wl_pointer *pointer, + uint32_t serial, struct wl_surface *surface, + wl_fixed_t sx, wl_fixed_t sy) +{ + struct vo_wayland_state *wl = data; - // numpad without numlock - {XKB_KEY_KP_Insert, MP_KEY_KPINS}, {XKB_KEY_KP_End, MP_KEY_KP1}, - {XKB_KEY_KP_Down, MP_KEY_KP2}, {XKB_KEY_KP_Page_Down, MP_KEY_KP3}, - {XKB_KEY_KP_Left, MP_KEY_KP4}, {XKB_KEY_KP_Begin, MP_KEY_KP5}, - {XKB_KEY_KP_Right, MP_KEY_KP6}, {XKB_KEY_KP_Home, MP_KEY_KP7}, - {XKB_KEY_KP_Up, MP_KEY_KP8}, {XKB_KEY_KP_Page_Up, MP_KEY_KP9}, - {XKB_KEY_KP_Delete, MP_KEY_KPDEL}, + wl->pointer = pointer; + wl->pointer_id = serial; - // "Multimedia keyboard" keys - {XKB_KEY_XF86MenuKB, MP_KEY_MENU}, - {XKB_KEY_XF86AudioPlay, MP_KEY_PLAY}, {XKB_KEY_XF86AudioPause, MP_KEY_PAUSE}, - {XKB_KEY_XF86AudioStop, MP_KEY_STOP}, - {XKB_KEY_XF86AudioPrev, MP_KEY_PREV}, {XKB_KEY_XF86AudioNext, MP_KEY_NEXT}, - {XKB_KEY_XF86AudioRewind, MP_KEY_REWIND}, - {XKB_KEY_XF86AudioForward, MP_KEY_FORWARD}, - {XKB_KEY_XF86AudioMute, MP_KEY_MUTE}, - {XKB_KEY_XF86AudioLowerVolume, 
MP_KEY_VOLUME_DOWN}, - {XKB_KEY_XF86AudioRaiseVolume, MP_KEY_VOLUME_UP}, - {XKB_KEY_XF86HomePage, MP_KEY_HOMEPAGE}, {XKB_KEY_XF86WWW, MP_KEY_WWW}, - {XKB_KEY_XF86Mail, MP_KEY_MAIL}, {XKB_KEY_XF86Favorites, MP_KEY_FAVORITES}, - {XKB_KEY_XF86Search, MP_KEY_SEARCH}, {XKB_KEY_XF86Sleep, MP_KEY_SLEEP}, + set_cursor_visibility(wl, true); + mp_input_put_key(wl->vo->input_ctx, MP_KEY_MOUSE_ENTER); +} - {0, 0} -}; +static void pointer_handle_leave(void *data, struct wl_pointer *pointer, + uint32_t serial, struct wl_surface *surface) +{ + struct vo_wayland_state *wl = data; + mp_input_put_key(wl->vo->input_ctx, MP_KEY_MOUSE_LEAVE); +} +static void pointer_handle_motion(void *data, struct wl_pointer *pointer, + uint32_t time, wl_fixed_t sx, wl_fixed_t sy) +{ + struct vo_wayland_state *wl = data; -/** Wayland listeners **/ + wl->mouse_x = wl_fixed_to_int(sx) * wl->scaling; + wl->mouse_y = wl_fixed_to_int(sy) * wl->scaling; -static void ssurface_handle_ping(void *data, - struct wl_shell_surface *shell_surface, - uint32_t serial) + mp_input_set_mouse_pos(wl->vo->input_ctx, wl->mouse_x, wl->mouse_y); +} + +static void window_move(struct vo_wayland_state *wl, uint32_t serial) { - wl_shell_surface_pong(shell_surface, serial); + if (wl->xdg_toplevel) + zxdg_toplevel_v6_move(wl->xdg_toplevel, wl->seat, serial); } -static void ssurface_handle_configure(void *data, - struct wl_shell_surface *shell_surface, - uint32_t edges, - int32_t width, - int32_t height) +static void pointer_handle_button(void *data, struct wl_pointer *wl_pointer, + uint32_t serial, uint32_t time, uint32_t button, + uint32_t state) { struct vo_wayland_state *wl = data; - float win_aspect = wl->window.aspect; - if (!width || !height) - return; - if (!wl->window.is_fullscreen) - width = win_aspect * height; - schedule_resize(wl, edges, width, height); + + state = state == WL_POINTER_BUTTON_STATE_PRESSED ? MP_KEY_STATE_DOWN + : MP_KEY_STATE_UP; + + button = button == BTN_LEFT ? MP_MBTN_LEFT : + button == BTN_MIDDLE ? 
MP_MBTN_MID : MP_MBTN_RIGHT; + + mp_input_put_key(wl->vo->input_ctx, button | state); + + if (!mp_input_test_dragging(wl->vo->input_ctx, wl->mouse_x, wl->mouse_y) && + (button == MP_MBTN_LEFT) && (state == MP_KEY_STATE_DOWN)) + window_move(wl, serial); } -static void ssurface_handle_popup_done(void *data, - struct wl_shell_surface *shell_surface) +static void pointer_handle_axis(void *data, struct wl_pointer *wl_pointer, + uint32_t time, uint32_t axis, wl_fixed_t value) { + struct vo_wayland_state *wl = data; + double val = wl_fixed_to_double(value)*0.1; + switch (axis) { + case WL_POINTER_AXIS_VERTICAL_SCROLL: + if (value > 0) + mp_input_put_wheel(wl->vo->input_ctx, MP_WHEEL_DOWN, +val); + if (value < 0) + mp_input_put_wheel(wl->vo->input_ctx, MP_WHEEL_UP, -val); + break; + case WL_POINTER_AXIS_HORIZONTAL_SCROLL: + if (value > 0) + mp_input_put_wheel(wl->vo->input_ctx, MP_WHEEL_RIGHT, +val); + if (value < 0) + mp_input_put_wheel(wl->vo->input_ctx, MP_WHEEL_LEFT, -val); + break; + } } -static const struct wl_shell_surface_listener shell_surface_listener = { - ssurface_handle_ping, - ssurface_handle_configure, - ssurface_handle_popup_done +static const struct wl_pointer_listener pointer_listener = { + pointer_handle_enter, + pointer_handle_leave, + pointer_handle_motion, + pointer_handle_button, + pointer_handle_axis, }; -static void output_handle_geometry(void *data, - struct wl_output *wl_output, - int32_t x, - int32_t y, - int32_t physical_width, - int32_t physical_height, - int32_t subpixel, - const char *make, - const char *model, - int32_t transform) +static int check_for_resize(struct vo_wayland_state *wl, wl_fixed_t x_w, wl_fixed_t y_w, + enum zxdg_toplevel_v6_resize_edge *edge) { - struct vo_wayland_output *output = data; - output->make = make; - output->model = model; + if (wl->touch_entries || wl->fullscreen) + return 0; + + const int edge_pixels = 64; + int pos[2] = { wl_fixed_to_double(x_w), wl_fixed_to_double(y_w) }; + int left_edge = pos[0] < 
edge_pixels; + int top_edge = pos[1] < edge_pixels; + int right_edge = pos[0] > (mp_rect_w(wl->geometry) - edge_pixels); + int bottom_edge = pos[1] > (mp_rect_h(wl->geometry) - edge_pixels); + + if (left_edge) { + *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_LEFT; + if (top_edge) + *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_TOP_LEFT; + else if (bottom_edge) + *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_BOTTOM_LEFT; + } else if (right_edge) { + *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_RIGHT; + if (top_edge) + *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_TOP_RIGHT; + else if (bottom_edge) + *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_BOTTOM_RIGHT; + } else if (top_edge) { + *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_TOP; + } else if (bottom_edge) { + *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_BOTTOM; + } else { + *edge = 0; + return 0; + } + + return 1; } -static void output_handle_mode(void *data, - struct wl_output *wl_output, - uint32_t flags, - int32_t width, - int32_t height, - int32_t refresh) +static void touch_handle_down(void *data, struct wl_touch *wl_touch, + uint32_t serial, uint32_t time, struct wl_surface *surface, + int32_t id, wl_fixed_t x_w, wl_fixed_t y_w) { - struct vo_wayland_output *output = data; + struct vo_wayland_state *wl = data; - // only save current mode - if (!output || !(flags & WL_OUTPUT_MODE_CURRENT)) + enum zxdg_toplevel_v6_resize_edge edge; + if (check_for_resize(wl, x_w, y_w, &edge)) { + wl->touch_entries = 0; + zxdg_toplevel_v6_resize(wl->xdg_toplevel, wl->seat, serial, edge); + return; + } else if (wl->touch_entries) { + wl->touch_entries = 0; + zxdg_toplevel_v6_move(wl->xdg_toplevel, wl->seat, serial); return; + } - output->width = width; - output->height = height; - output->flags = flags; - output->refresh_rate = refresh; -} + wl->touch_entries = 1; -static void output_handle_done(void* data, struct wl_output *wl_output) -{ -} + wl->mouse_x = wl_fixed_to_int(x_w) * wl->scaling; + wl->mouse_y = wl_fixed_to_int(y_w) * wl->scaling; -static void output_handle_scale(void* data, 
struct wl_output *wl_output, - int32_t factor) -{ - struct vo_wayland_output *output = data; - output->scale = factor; + mp_input_set_mouse_pos(wl->vo->input_ctx, wl->mouse_x, wl->mouse_y); + mp_input_put_key(wl->vo->input_ctx, MP_MBTN_LEFT | MP_KEY_STATE_DOWN); } -static const struct wl_output_listener output_listener = { - output_handle_geometry, - output_handle_mode, - output_handle_done, - output_handle_scale -}; +static void touch_handle_up(void *data, struct wl_touch *wl_touch, + uint32_t serial, uint32_t time, int32_t id) +{ + struct vo_wayland_state *wl = data; + wl->touch_entries = 0; -/* SURFACE LISTENER */ + mp_input_put_key(wl->vo->input_ctx, MP_MBTN_LEFT | MP_KEY_STATE_UP); +} -static void surface_handle_enter(void *data, - struct wl_surface *wl_surface, - struct wl_output *output) +static void touch_handle_motion(void *data, struct wl_touch *wl_touch, + uint32_t time, int32_t id, wl_fixed_t x_w, wl_fixed_t y_w) { struct vo_wayland_state *wl = data; - wl->display.current_output = NULL; - struct vo_wayland_output *o; - wl_list_for_each(o, &wl->display.output_list, link) { - if (o->output == output) { - wl->display.current_output = o; - break; - } - } + wl->mouse_x = wl_fixed_to_int(x_w) * wl->scaling; + wl->mouse_y = wl_fixed_to_int(y_w) * wl->scaling; - wl->window.events |= VO_EVENT_WIN_STATE | VO_EVENT_RESIZE; + mp_input_set_mouse_pos(wl->vo->input_ctx, wl->mouse_x, wl->mouse_y); } -static void surface_handle_leave(void *data, - struct wl_surface *wl_surface, - struct wl_output *output) +static void touch_handle_frame(void *data, struct wl_touch *wl_touch) { - // window can be displayed at 2 output, but we only use the most recently - // entered and discard the previous one even if a part of the window is - // still visible on the previous entered output. 
- // Don't bother with a "leave" logic } -static const struct wl_surface_listener surface_listener = { - surface_handle_enter, - surface_handle_leave +static void touch_handle_cancel(void *data, struct wl_touch *wl_touch) +{ +} + +static const struct wl_touch_listener touch_listener = { + touch_handle_down, + touch_handle_up, + touch_handle_motion, + touch_handle_frame, + touch_handle_cancel, }; -/* KEYBOARD LISTENER */ -static void keyboard_handle_keymap(void *data, - struct wl_keyboard *wl_keyboard, - uint32_t format, - int32_t fd, - uint32_t size) +static const struct mp_keymap keymap[] = { + /* Special keys */ + {XKB_KEY_Pause, MP_KEY_PAUSE}, {XKB_KEY_Escape, MP_KEY_ESC}, + {XKB_KEY_BackSpace, MP_KEY_BS}, {XKB_KEY_Tab, MP_KEY_TAB}, + {XKB_KEY_Return, MP_KEY_ENTER}, {XKB_KEY_Menu, MP_KEY_MENU}, + {XKB_KEY_Print, MP_KEY_PRINT}, + + /* Cursor keys */ + {XKB_KEY_Left, MP_KEY_LEFT}, {XKB_KEY_Right, MP_KEY_RIGHT}, + {XKB_KEY_Up, MP_KEY_UP}, {XKB_KEY_Down, MP_KEY_DOWN}, + + /* Navigation keys */ + {XKB_KEY_Insert, MP_KEY_INSERT}, {XKB_KEY_Delete, MP_KEY_DELETE}, + {XKB_KEY_Home, MP_KEY_HOME}, {XKB_KEY_End, MP_KEY_END}, + {XKB_KEY_Page_Up, MP_KEY_PAGE_UP}, {XKB_KEY_Page_Down, MP_KEY_PAGE_DOWN}, + + /* F-keys */ + {XKB_KEY_F1, MP_KEY_F + 1}, {XKB_KEY_F2, MP_KEY_F + 2}, + {XKB_KEY_F3, MP_KEY_F + 3}, {XKB_KEY_F4, MP_KEY_F + 4}, + {XKB_KEY_F5, MP_KEY_F + 5}, {XKB_KEY_F6, MP_KEY_F + 6}, + {XKB_KEY_F7, MP_KEY_F + 7}, {XKB_KEY_F8, MP_KEY_F + 8}, + {XKB_KEY_F9, MP_KEY_F + 9}, {XKB_KEY_F10, MP_KEY_F +10}, + {XKB_KEY_F11, MP_KEY_F +11}, {XKB_KEY_F12, MP_KEY_F +12}, + + /* Numpad independent of numlock */ + {XKB_KEY_KP_Subtract, '-'}, {XKB_KEY_KP_Add, '+'}, + {XKB_KEY_KP_Multiply, '*'}, {XKB_KEY_KP_Divide, '/'}, + {XKB_KEY_KP_Enter, MP_KEY_KPENTER}, + + /* Numpad with numlock */ + {XKB_KEY_KP_0, MP_KEY_KP0}, {XKB_KEY_KP_1, MP_KEY_KP1}, + {XKB_KEY_KP_2, MP_KEY_KP2}, {XKB_KEY_KP_3, MP_KEY_KP3}, + {XKB_KEY_KP_4, MP_KEY_KP4}, {XKB_KEY_KP_5, MP_KEY_KP5}, + {XKB_KEY_KP_6, MP_KEY_KP6}, 
{XKB_KEY_KP_7, MP_KEY_KP7}, + {XKB_KEY_KP_8, MP_KEY_KP8}, {XKB_KEY_KP_9, MP_KEY_KP9}, + {XKB_KEY_KP_Decimal, MP_KEY_KPDEC}, {XKB_KEY_KP_Separator, MP_KEY_KPDEC}, + + /* Numpad without numlock */ + {XKB_KEY_KP_Insert, MP_KEY_KPINS}, {XKB_KEY_KP_End, MP_KEY_KP1}, + {XKB_KEY_KP_Down, MP_KEY_KP2}, {XKB_KEY_KP_Page_Down, MP_KEY_KP3}, + {XKB_KEY_KP_Left, MP_KEY_KP4}, {XKB_KEY_KP_Begin, MP_KEY_KP5}, + {XKB_KEY_KP_Right, MP_KEY_KP6}, {XKB_KEY_KP_Home, MP_KEY_KP7}, + {XKB_KEY_KP_Up, MP_KEY_KP8}, {XKB_KEY_KP_Page_Up, MP_KEY_KP9}, + {XKB_KEY_KP_Delete, MP_KEY_KPDEL}, + + /* Multimedia keys */ + {XKB_KEY_XF86MenuKB, MP_KEY_MENU}, + {XKB_KEY_XF86AudioPlay, MP_KEY_PLAY}, {XKB_KEY_XF86AudioPause, MP_KEY_PAUSE}, + {XKB_KEY_XF86AudioStop, MP_KEY_STOP}, + {XKB_KEY_XF86AudioPrev, MP_KEY_PREV}, {XKB_KEY_XF86AudioNext, MP_KEY_NEXT}, + {XKB_KEY_XF86AudioRewind, MP_KEY_REWIND}, + {XKB_KEY_XF86AudioForward, MP_KEY_FORWARD}, + {XKB_KEY_XF86AudioMute, MP_KEY_MUTE}, + {XKB_KEY_XF86AudioLowerVolume, MP_KEY_VOLUME_DOWN}, + {XKB_KEY_XF86AudioRaiseVolume, MP_KEY_VOLUME_UP}, + {XKB_KEY_XF86HomePage, MP_KEY_HOMEPAGE}, {XKB_KEY_XF86WWW, MP_KEY_WWW}, + {XKB_KEY_XF86Mail, MP_KEY_MAIL}, {XKB_KEY_XF86Favorites, MP_KEY_FAVORITES}, + {XKB_KEY_XF86Search, MP_KEY_SEARCH}, {XKB_KEY_XF86Sleep, MP_KEY_SLEEP}, + + {0, 0} +}; + +static void keyboard_handle_keymap(void *data, struct wl_keyboard *wl_keyboard, + uint32_t format, int32_t fd, uint32_t size) { struct vo_wayland_state *wl = data; char *map_str; @@ -270,68 +355,97 @@ static void keyboard_handle_keymap(void *data, return; } - wl->input.xkb.keymap = xkb_keymap_new_from_string(wl->input.xkb.context, - map_str, - XKB_KEYMAP_FORMAT_TEXT_V1, - 0); + wl->xkb_keymap = xkb_keymap_new_from_string(wl->xkb_context, map_str, + XKB_KEYMAP_FORMAT_TEXT_V1, 0); munmap(map_str, size); close(fd); - if (!wl->input.xkb.keymap) { + if (!wl->xkb_keymap) { MP_ERR(wl, "failed to compile keymap\n"); return; } - wl->input.xkb.state = xkb_state_new(wl->input.xkb.keymap); - if 
(!wl->input.xkb.state) { + wl->xkb_state = xkb_state_new(wl->xkb_keymap); + if (!wl->xkb_state) { MP_ERR(wl, "failed to create XKB state\n"); - xkb_keymap_unref(wl->input.xkb.keymap); - wl->input.xkb.keymap = NULL; + xkb_keymap_unref(wl->xkb_keymap); + wl->xkb_keymap = NULL; return; } } -static void keyboard_handle_enter(void *data, - struct wl_keyboard *wl_keyboard, - uint32_t serial, - struct wl_surface *surface, +static void keyboard_handle_enter(void *data, struct wl_keyboard *wl_keyboard, + uint32_t serial, struct wl_surface *surface, struct wl_array *keys) { } -static void keyboard_handle_leave(void *data, - struct wl_keyboard *wl_keyboard, - uint32_t serial, - struct wl_surface *surface) +static void keyboard_handle_leave(void *data, struct wl_keyboard *wl_keyboard, + uint32_t serial, struct wl_surface *surface) +{ +} + +static bool create_input(struct vo_wayland_state *wl) { + wl->xkb_context = xkb_context_new(XKB_CONTEXT_NO_FLAGS); + + if (!wl->xkb_context) { + MP_ERR(wl, "failed to initialize input: check xkbcommon\n"); + return 1; + } + + return 0; } -static void keyboard_handle_key(void *data, - struct wl_keyboard *wl_keyboard, - uint32_t serial, - uint32_t time, - uint32_t key, +static int lookupkey(int key) +{ + const char *passthrough_keys = " -+*/<>`~!@#$%^&()_{}:;\"\',.?\\|=[]"; + + int mpkey = 0; + if ((key >= 'a' && key <= 'z') || (key >= 'A' && key <= 'Z') || + (key >= '0' && key <= '9') || + (key > 0 && key < 256 && strchr(passthrough_keys, key))) + mpkey = key; + + if (!mpkey) + mpkey = lookup_keymap_table(keymap, key); + + return mpkey; +} + +static void keyboard_handle_key(void *data, struct wl_keyboard *wl_keyboard, + uint32_t serial, uint32_t time, uint32_t key, uint32_t state) { struct vo_wayland_state *wl = data; uint32_t code = code = key + 8; - xkb_keysym_t sym = xkb_state_key_get_one_sym(wl->input.xkb.state, code); + xkb_keysym_t sym = xkb_state_key_get_one_sym(wl->xkb_state, code); int mpmod = state == WL_KEYBOARD_KEY_STATE_PRESSED ? 
MP_KEY_STATE_DOWN : MP_KEY_STATE_UP; - static const char *mod_names[] = {XKB_MOD_NAME_SHIFT, XKB_MOD_NAME_CTRL, - XKB_MOD_NAME_ALT, XKB_MOD_NAME_LOGO, 0}; - static int mods[] = {MP_KEY_MODIFIER_SHIFT, MP_KEY_MODIFIER_CTRL, - MP_KEY_MODIFIER_ALT, MP_KEY_MODIFIER_META, 0}; + static const char *mod_names[] = { + XKB_MOD_NAME_SHIFT, + XKB_MOD_NAME_CTRL, + XKB_MOD_NAME_ALT, + XKB_MOD_NAME_LOGO, + 0, + }; + + static int mods[] = { + MP_KEY_MODIFIER_SHIFT, + MP_KEY_MODIFIER_CTRL, + MP_KEY_MODIFIER_ALT, + MP_KEY_MODIFIER_META, + 0, + }; for (int n = 0; mods[n]; n++) { - xkb_mod_index_t index = - xkb_keymap_mod_get_index(wl->input.xkb.keymap, mod_names[n]); - if (!xkb_state_mod_index_is_consumed(wl->input.xkb.state, code, index) - && xkb_state_mod_index_is_active(wl->input.xkb.state, index, + xkb_mod_index_t index = xkb_keymap_mod_get_index(wl->xkb_keymap, mod_names[n]); + if (!xkb_state_mod_index_is_consumed(wl->xkb_state, code, index) + && xkb_state_mod_index_is_active(wl->xkb_state, index, XKB_STATE_MODS_DEPRESSED)) mpmod |= mods[n]; } @@ -340,42 +454,29 @@ static void keyboard_handle_key(void *data, if (mpkey) { mp_input_put_key(wl->vo->input_ctx, mpkey | mpmod); } else { - char s[80]; + char s[128]; if (xkb_keysym_to_utf8(sym, s, sizeof(s)) > 0) mp_input_put_key_utf8(wl->vo->input_ctx, mpmod, bstr0(s)); } } -static void keyboard_handle_modifiers(void *data, - struct wl_keyboard *wl_keyboard, - uint32_t serial, - uint32_t mods_depressed, - uint32_t mods_latched, - uint32_t mods_locked, +static void keyboard_handle_modifiers(void *data, struct wl_keyboard *wl_keyboard, + uint32_t serial, uint32_t mods_depressed, + uint32_t mods_latched, uint32_t mods_locked, uint32_t group) { struct vo_wayland_state *wl = data; - xkb_state_update_mask(wl->input.xkb.state, - mods_depressed, - mods_latched, - mods_locked, - 0, 0, group); + xkb_state_update_mask(wl->xkb_state, mods_depressed, mods_latched, + mods_locked, 0, 0, group); } -static void keyboard_handle_repeat_info(void *data, - 
struct wl_keyboard *wl_keyboard, - int32_t rate, - int32_t delay) +static void keyboard_handle_repeat_info(void *data, struct wl_keyboard *wl_keyboard, + int32_t rate, int32_t delay) { struct vo_wayland_state *wl = data; - if (wl->vo->opts->native_keyrepeat) { - if (rate < 0 || delay < 0) { - MP_WARN(wl, "Invalid rate or delay values sent by compositor\n"); - return; - } + if (wl->vo->opts->native_keyrepeat) mp_input_set_repeat_info(wl->vo->input_ctx, rate, delay); - } } static const struct wl_keyboard_listener keyboard_listener = { @@ -384,562 +485,566 @@ static const struct wl_keyboard_listener keyboard_listener = { keyboard_handle_leave, keyboard_handle_key, keyboard_handle_modifiers, - keyboard_handle_repeat_info + keyboard_handle_repeat_info, }; -/* POINTER LISTENER */ -static void pointer_handle_enter(void *data, - struct wl_pointer *pointer, - uint32_t serial, - struct wl_surface *surface, - wl_fixed_t sx_w, - wl_fixed_t sy_w) +static void seat_handle_caps(void *data, struct wl_seat *seat, + enum wl_seat_capability caps) { struct vo_wayland_state *wl = data; - wl->cursor.serial = serial; - wl->cursor.pointer = pointer; + if ((caps & WL_SEAT_CAPABILITY_POINTER) && !wl->pointer) { + wl->pointer = wl_seat_get_pointer(seat); + wl_pointer_add_listener(wl->pointer, &pointer_listener, wl); + } else if (!(caps & WL_SEAT_CAPABILITY_POINTER) && wl->pointer) { + wl_pointer_destroy(wl->pointer); + wl->pointer = NULL; + } - /* Release the left button on pointer enter again - * because after moving the shell surface no release event is sent */ - mp_input_put_key(wl->vo->input_ctx, MP_KEY_MOUSE_ENTER); - mp_input_put_key(wl->vo->input_ctx, MP_MBTN_LEFT | MP_KEY_STATE_UP); - show_cursor(wl); + if ((caps & WL_SEAT_CAPABILITY_KEYBOARD) && !wl->keyboard) { + wl->keyboard = wl_seat_get_keyboard(seat); + wl_keyboard_add_listener(wl->keyboard, &keyboard_listener, wl); + } else if (!(caps & WL_SEAT_CAPABILITY_KEYBOARD) && wl->keyboard) { + wl_keyboard_destroy(wl->keyboard); + 
wl->keyboard = NULL; + } + + if ((caps & WL_SEAT_CAPABILITY_TOUCH) && !wl->touch) { + wl->touch = wl_seat_get_touch(seat); + wl_touch_set_user_data(wl->touch, wl); + wl_touch_add_listener(wl->touch, &touch_listener, wl); + } else if (!(caps & WL_SEAT_CAPABILITY_TOUCH) && wl->touch) { + wl_touch_destroy(wl->touch); + wl->touch = NULL; + } } -static void pointer_handle_leave(void *data, - struct wl_pointer *pointer, - uint32_t serial, - struct wl_surface *surface) +static const struct wl_seat_listener seat_listener = { + seat_handle_caps, +}; + +static void output_handle_geometry(void *data, struct wl_output *wl_output, + int32_t x, int32_t y, int32_t phys_width, + int32_t phys_height, int32_t subpixel, + const char *make, const char *model, + int32_t transform) { - struct vo_wayland_state *wl = data; - mp_input_put_key(wl->vo->input_ctx, MP_KEY_MOUSE_LEAVE); + struct vo_wayland_output *output = data; + output->make = talloc_strdup(output->wl, make); + output->model = talloc_strdup(output->wl, model); + output->geometry.x0 = x; + output->geometry.y0 = y; + output->phys_width = phys_width; + output->phys_height = phys_height; } -static void pointer_handle_motion(void *data, - struct wl_pointer *pointer, - uint32_t time, - wl_fixed_t sx_w, - wl_fixed_t sy_w) +static void output_handle_mode(void *data, struct wl_output *wl_output, + uint32_t flags, int32_t width, + int32_t height, int32_t refresh) { - int32_t scale = 1; - struct vo_wayland_state *wl = data; - - if (wl->display.current_output) - scale = wl->display.current_output->scale; + struct vo_wayland_output *output = data; - wl->cursor.pointer = pointer; - wl->window.mouse_x = scale*wl_fixed_to_int(sx_w); - wl->window.mouse_y = scale*wl_fixed_to_int(sy_w); + /* Only save current mode */ + if (!(flags & WL_OUTPUT_MODE_CURRENT)) + return; - mp_input_set_mouse_pos(wl->vo->input_ctx, wl->window.mouse_x, - wl->window.mouse_y); + output->geometry.x1 = width; + output->geometry.y1 = height; + output->flags = flags; + 
output->refresh_rate = (double)refresh * 0.001; } -static void pointer_handle_button(void *data, - struct wl_pointer *pointer, - uint32_t serial, - uint32_t time, - uint32_t button, - uint32_t state) +static void output_handle_done(void* data, struct wl_output *wl_output) { - struct vo_wayland_state *wl = data; - - state = state == WL_POINTER_BUTTON_STATE_PRESSED ? MP_KEY_STATE_DOWN - : MP_KEY_STATE_UP; - - button = button == BTN_LEFT ? MP_MBTN_LEFT : - button == BTN_MIDDLE ? MP_MBTN_MID : MP_MBTN_RIGHT; - - mp_input_put_key(wl->vo->input_ctx, button | state); - - if (!mp_input_test_dragging(wl->vo->input_ctx, wl->window.mouse_x, wl->window.mouse_y) && - (button == MP_MBTN_LEFT) && (state == MP_KEY_STATE_DOWN)) - window_move(wl, serial); + struct vo_wayland_output *o = data; + + o->geometry.x1 += o->geometry.x0; + o->geometry.y1 += o->geometry.y0; + + MP_VERBOSE(o->wl, "Registered output %s %s (0x%x):\n" + "\tx: %dpx, y: %dpx\n" + "\tw: %dpx (%dmm), h: %dpx (%dmm)\n" + "\tscale: %d\n" + "\tHz: %f\n", o->make, o->model, o->id, o->geometry.x0, + o->geometry.y0, mp_rect_w(o->geometry), o->phys_width, + mp_rect_h(o->geometry), o->phys_height, o->scale, o->refresh_rate); } -static void pointer_handle_axis(void *data, - struct wl_pointer *pointer, - uint32_t time, - uint32_t axis, - wl_fixed_t value) +static void output_handle_scale(void* data, struct wl_output *wl_output, + int32_t factor) { - struct vo_wayland_state *wl = data; - - // value is 10.00 on a normal mouse wheel - // scale it down to 1.00 for multipliying it with the commands - if (axis == WL_POINTER_AXIS_VERTICAL_SCROLL) { - if (value > 0) - mp_input_put_wheel(wl->vo->input_ctx, MP_WHEEL_DOWN, - wl_fixed_to_double(value)*0.1); - if (value < 0) - mp_input_put_wheel(wl->vo->input_ctx, MP_WHEEL_UP, - wl_fixed_to_double(value)*-0.1); - } - else if (axis == WL_POINTER_AXIS_HORIZONTAL_SCROLL) { - if (value > 0) - mp_input_put_wheel(wl->vo->input_ctx, MP_WHEEL_RIGHT, - wl_fixed_to_double(value)*0.1); - if (value < 
0) - mp_input_put_wheel(wl->vo->input_ctx, MP_WHEEL_LEFT, - wl_fixed_to_double(value)*-0.1); + struct vo_wayland_output *output = data; + if (!factor) { + MP_ERR(output->wl, "Invalid output scale given by the compositor!\n"); + return; } + output->scale = factor; } -static const struct wl_pointer_listener pointer_listener = { - pointer_handle_enter, - pointer_handle_leave, - pointer_handle_motion, - pointer_handle_button, - pointer_handle_axis, +static const struct wl_output_listener output_listener = { + output_handle_geometry, + output_handle_mode, + output_handle_done, + output_handle_scale, }; -static void seat_handle_capabilities(void *data, - struct wl_seat *seat, - enum wl_seat_capability caps) +static void data_offer_handle_offer(void *data, struct wl_data_offer *offer, + const char *mime_type) { struct vo_wayland_state *wl = data; - - if ((caps & WL_SEAT_CAPABILITY_KEYBOARD) && !wl->input.keyboard) { - wl->input.keyboard = wl_seat_get_keyboard(seat); - wl_keyboard_add_listener(wl->input.keyboard, &keyboard_listener, wl); - } - else if (!(caps & WL_SEAT_CAPABILITY_KEYBOARD) && wl->input.keyboard) { - wl_keyboard_destroy(wl->input.keyboard); - wl->input.keyboard = NULL; - } - if ((caps & WL_SEAT_CAPABILITY_POINTER) && !wl->input.pointer) { - wl->input.pointer = wl_seat_get_pointer(seat); - wl_pointer_add_listener(wl->input.pointer, &pointer_listener, wl); - } - else if (!(caps & WL_SEAT_CAPABILITY_POINTER) && wl->input.pointer) { - wl_pointer_destroy(wl->input.pointer); - wl->input.pointer = NULL; + int score = mp_event_get_mime_type_score(wl->vo->input_ctx, mime_type); + if (score > wl->dnd_mime_score) { + wl->dnd_mime_score = score; + talloc_free(wl->dnd_mime_type); + wl->dnd_mime_type = talloc_strdup(wl, mime_type); + MP_VERBOSE(wl, "Given DND offer with mime type %s\n", wl->dnd_mime_type); } } -static void seat_handle_name(void *data, - struct wl_seat *seat, - const char *name) +static void data_offer_source_actions(void *data, struct wl_data_offer 
*offer, uint32_t source_actions) +{ + +} + +static void data_offer_action(void *data, struct wl_data_offer *wl_data_offer, uint32_t dnd_action) { struct vo_wayland_state *wl = data; - MP_VERBOSE(wl, "Seat \"%s\" connected\n", name); + wl->dnd_action = dnd_action & WL_DATA_DEVICE_MANAGER_DND_ACTION_COPY ? + DND_REPLACE : DND_APPEND; + MP_VERBOSE(wl, "DND action is %s\n", + wl->dnd_action == DND_REPLACE ? "DND_REPLACE" : "DND_APPEND"); } -static const struct wl_seat_listener seat_listener = { - seat_handle_capabilities, - seat_handle_name, +static const struct wl_data_offer_listener data_offer_listener = { + data_offer_handle_offer, + data_offer_source_actions, + data_offer_action, }; -static void registry_handle_global(void *data, struct wl_registry *reg, - uint32_t id, const char *interface, - uint32_t version) +static void data_device_handle_data_offer(void *data, struct wl_data_device *wl_ddev, + struct wl_data_offer *id) { struct vo_wayland_state *wl = data; + if (wl->dnd_offer) + wl_data_offer_destroy(wl->dnd_offer); - if (strcmp(interface, "wl_compositor") == 0) { - - wl->display.compositor = wl_registry_bind(reg, id, - &wl_compositor_interface, - MPMIN(3, version)); - } - - else if (strcmp(interface, "wl_shell") == 0) { - - wl->display.shell = wl_registry_bind(reg, id, &wl_shell_interface, 1); - } - - else if (strcmp(interface, "wl_shm") == 0) { - - wl->display.shm = wl_registry_bind(reg, id, &wl_shm_interface, 1); - } - - else if (strcmp(interface, "wl_output") == 0) { - - struct vo_wayland_output *output = - talloc_zero(wl, struct vo_wayland_output); - - output->id = id; - output->scale = 1; - output->output = wl_registry_bind(reg, id, &wl_output_interface, - MPMIN(2, version)); + wl->dnd_offer = id; + wl_data_offer_add_listener(id, &data_offer_listener, wl); +} - wl_output_add_listener(output->output, &output_listener, output); - wl_list_insert(&wl->display.output_list, &output->link); +static void data_device_handle_enter(void *data, struct wl_data_device 
*wl_ddev, + uint32_t serial, struct wl_surface *surface, + wl_fixed_t x, wl_fixed_t y, + struct wl_data_offer *id) +{ + struct vo_wayland_state *wl = data; + if (wl->dnd_offer != id) { + MP_FATAL(wl, "DND offer ID mismatch!\n"); + return; } - else if (strcmp(interface, "wl_seat") == 0) { - - wl->input.seat = wl_registry_bind(reg, id, &wl_seat_interface, 4); - wl_seat_add_listener(wl->input.seat, &seat_listener, wl); - - } + wl_data_offer_set_actions(id, WL_DATA_DEVICE_MANAGER_DND_ACTION_COPY | + WL_DATA_DEVICE_MANAGER_DND_ACTION_MOVE, + WL_DATA_DEVICE_MANAGER_DND_ACTION_COPY); - else if (strcmp(interface, "wl_subcompositor") == 0) { + wl_data_offer_accept(id, serial, wl->dnd_mime_type); - wl->display.subcomp = wl_registry_bind(reg, id, - &wl_subcompositor_interface, 1); - } + MP_VERBOSE(wl, "Accepting DND offer with mime type %s\n", wl->dnd_mime_type); } -static void registry_handle_global_remove(void *data, - struct wl_registry *registry, - uint32_t id) +static void data_device_handle_leave(void *data, struct wl_data_device *wl_ddev) { -} + struct vo_wayland_state *wl = data; -static const struct wl_registry_listener registry_listener = { - registry_handle_global, - registry_handle_global_remove -}; + if (wl->dnd_offer) { + if (wl->dnd_fd != -1) + return; + wl_data_offer_destroy(wl->dnd_offer); + wl->dnd_offer = NULL; + } + MP_VERBOSE(wl, "Releasing DND offer with mime type %s\n", wl->dnd_mime_type); -/*** internal functions ***/ + talloc_free(wl->dnd_mime_type); + wl->dnd_mime_type = NULL; + wl->dnd_mime_score = 0; +} -static int lookupkey(int key) +static void data_device_handle_motion(void *data, struct wl_data_device *wl_ddev, + uint32_t time, wl_fixed_t x, wl_fixed_t y) { - const char *passthrough_keys = " -+*/<>`~!@#$%^&()_{}:;\"\',.?\\|=[]"; - - int mpkey = 0; - if ((key >= 'a' && key <= 'z') || - (key >= 'A' && key <= 'Z') || - (key >= '0' && key <= '9') || - (key > 0 && key < 256 && strchr(passthrough_keys, key))) - mpkey = key; - - if (!mpkey) - mpkey = 
lookup_keymap_table(keymap, key); + struct vo_wayland_state *wl = data; - return mpkey; + wl_data_offer_accept(wl->dnd_offer, time, wl->dnd_mime_type); } -static void hide_cursor (struct vo_wayland_state *wl) +static void data_device_handle_drop(void *data, struct wl_data_device *wl_ddev) { - if (!wl->cursor.pointer) - return; + struct vo_wayland_state *wl = data; - wl_pointer_set_cursor(wl->cursor.pointer, wl->cursor.serial, NULL, 0, 0); -} + int pipefd[2]; -static void show_cursor (struct vo_wayland_state *wl) -{ - if (!wl->cursor.pointer) + if (pipe2(pipefd, O_CLOEXEC) == -1) { + MP_ERR(wl, "Failed to create dnd pipe!\n"); return; + } - struct wl_cursor_image *image = wl->cursor.default_cursor->images[0]; - struct wl_buffer *buffer = wl_cursor_image_get_buffer(image); - - wl_pointer_set_cursor(wl->cursor.pointer, - wl->cursor.serial, - wl->cursor.surface, - image->hotspot_x, - image->hotspot_y); + MP_VERBOSE(wl, "Receiving DND offer with mime %s\n", wl->dnd_mime_type); - wl_surface_attach(wl->cursor.surface, buffer, 0, 0); - wl_surface_damage(wl->cursor.surface, 0, 0, image->width, image->height); - wl_surface_commit(wl->cursor.surface); -} + wl_data_offer_receive(wl->dnd_offer, wl->dnd_mime_type, pipefd[1]); + close(pipefd[1]); -static void window_move(struct vo_wayland_state *wl, uint32_t serial) -{ - if (wl->display.shell) - wl_shell_surface_move(wl->window.shell_surface, wl->input.seat, serial); + wl->dnd_fd = pipefd[0]; } -static void window_set_toplevel(struct vo_wayland_state *wl) +static void data_device_handle_selection(void *data, struct wl_data_device *wl_ddev, + struct wl_data_offer *id) { - if (wl->display.shell) - wl_shell_surface_set_toplevel(wl->window.shell_surface); } -static void window_set_title(struct vo_wayland_state *wl, const char *title) -{ - if (wl->display.shell) - wl_shell_surface_set_title(wl->window.shell_surface, title); -} +static const struct wl_data_device_listener data_device_listener = { + data_device_handle_data_offer, + 
data_device_handle_enter, + data_device_handle_leave, + data_device_handle_motion, + data_device_handle_drop, + data_device_handle_selection, +}; -static void schedule_resize(struct vo_wayland_state *wl, - uint32_t edges, - int32_t width, - int32_t height) +static void surface_handle_enter(void *data, struct wl_surface *wl_surface, + struct wl_output *output) { - int32_t minimum_size = 150; - int32_t x, y; - float win_aspect = wl->window.aspect; - if (win_aspect <= 0) - win_aspect = 1; - - MP_DBG(wl, "schedule resize: %dx%d\n", width, height); - - width = MPMAX(minimum_size, width); - height = MPMAX(minimum_size, height); - if (wl->display.current_output) { - int scale = wl->display.current_output->scale; - width = MPMIN(width, wl->display.current_output->width /scale); - height = MPMIN(height, wl->display.current_output->height/scale); - } + struct vo_wayland_state *wl = data; + wl->current_output = NULL; - // don't keep the aspect ratio in fullscreen mode because the compositor - // shows the desktop in the border regions if the video does not have the same - // aspect ratio as the screen - /* if only the height is changed we have to calculate the width - * in any other case we calculate the height */ - switch (edges) { - case WL_SHELL_SURFACE_RESIZE_TOP: - case WL_SHELL_SURFACE_RESIZE_BOTTOM: - width = win_aspect * height; - break; - case WL_SHELL_SURFACE_RESIZE_LEFT: - case WL_SHELL_SURFACE_RESIZE_RIGHT: - case WL_SHELL_SURFACE_RESIZE_TOP_LEFT: // just a preference - case WL_SHELL_SURFACE_RESIZE_TOP_RIGHT: - case WL_SHELL_SURFACE_RESIZE_BOTTOM_LEFT: - case WL_SHELL_SURFACE_RESIZE_BOTTOM_RIGHT: - height = (1 / win_aspect) * width; + struct vo_wayland_output *o; + wl_list_for_each(o, &wl->output_list, link) { + if (o->output == output) { + wl->current_output = o; break; + } } - if (edges & WL_SHELL_SURFACE_RESIZE_LEFT) - x = wl->window.width - width; - else - x = 0; + wl->current_output->has_surface = true; + if (wl->scaling != wl->current_output->scale) + 
wl->pending_vo_events |= VO_EVENT_RESIZE; + wl->scaling = wl->current_output->scale; - if (edges & WL_SHELL_SURFACE_RESIZE_TOP) - y = wl->window.height - height; - else - y = 0; + MP_VERBOSE(wl, "Surface entered output %s %s (0x%x), scale = %i\n", o->make, + o->model, o->id, wl->scaling); - wl->window.sh_width = width; - wl->window.sh_height = height; - wl->window.sh_x = x; - wl->window.sh_y = y; - wl->window.events |= VO_EVENT_RESIZE; + wl->pending_vo_events |= VO_EVENT_WIN_STATE; } -static void frame_callback(void *data, - struct wl_callback *callback, - uint32_t time) +static void surface_handle_leave(void *data, struct wl_surface *wl_surface, + struct wl_output *output) { struct vo_wayland_state *wl = data; - if (wl->frame.function) - wl->frame.function(wl->frame.data, time); + struct vo_wayland_output *o; + wl_list_for_each(o, &wl->output_list, link) { + if (o->output == output) { + o->has_surface = false; + wl->pending_vo_events |= VO_EVENT_WIN_STATE; + return; + } + } +} + +static const struct wl_surface_listener surface_listener = { + surface_handle_enter, + surface_handle_leave, +}; + +static const struct wl_callback_listener frame_listener; + +static void frame_callback(void *data, struct wl_callback *callback, uint32_t time) +{ + struct vo_wayland_state *wl = data; if (callback) wl_callback_destroy(callback); - wl->frame.callback = wl_surface_frame(wl->window.video_surface); - - if (!wl->frame.callback) { - MP_ERR(wl, "wl_surface_frame failed\n"); - return; - } + wl->frame_callback = wl_surface_frame(wl->surface); + wl_callback_add_listener(wl->frame_callback, &frame_listener, wl); - wl_callback_add_listener(wl->frame.callback, &frame_listener, wl); - wl_surface_commit(wl->window.video_surface); + if (!vo_render_frame_external(wl->vo)) + wl_surface_commit(wl->surface); } static const struct wl_callback_listener frame_listener = { - frame_callback + frame_callback, }; -static bool create_display(struct vo_wayland_state *wl) +static void 
registry_handle_add(void *data, struct wl_registry *reg, uint32_t id, + const char *interface, uint32_t ver) { - if (wl->vo->probing && !getenv("XDG_RUNTIME_DIR")) - return false; - - wl->display.display = wl_display_connect(NULL); - - if (!wl->display.display) { - MP_MSG(wl, wl->vo->probing ? MSGL_V : MSGL_ERR, - "failed to connect to a wayland server: " - "check if a wayland compositor is running\n"); + int found = 1; + struct vo_wayland_state *wl = data; - return false; + if (!strcmp(interface, wl_compositor_interface.name) && found++) { + ver = MPMIN(ver, 4); /* Cap the version */ + wl->compositor = wl_registry_bind(reg, id, &wl_compositor_interface, ver); + wl->surface = wl_compositor_create_surface(wl->compositor); + wl->cursor_surface = wl_compositor_create_surface(wl->compositor); + wl_surface_add_listener(wl->surface, &surface_listener, wl); + vo_enable_external_renderloop(wl->vo); + wl->frame_callback = wl_surface_frame(wl->surface); + wl_callback_add_listener(wl->frame_callback, &frame_listener, wl); } - wl->display.registry = wl_display_get_registry(wl->display.display); - wl_registry_add_listener(wl->display.registry, ®istry_listener, wl); - - wl_display_roundtrip(wl->display.display); + if (!strcmp(interface, wl_output_interface.name) && (ver >= 2) && found++) { + struct vo_wayland_output *output = talloc_zero(wl, struct vo_wayland_output); - wl->display.display_fd = wl_display_get_fd(wl->display.display); + output->wl = wl; + output->id = id; + output->scale = 1; + output->output = wl_registry_bind(reg, id, &wl_output_interface, 2); - return true; -} - -static void destroy_display(struct vo_wayland_state *wl) -{ - struct vo_wayland_output *output = NULL; - struct vo_wayland_output *tmp = NULL; - - wl_list_for_each_safe(output, tmp, &wl->display.output_list, link) { - if (output && output->output) { - wl_output_destroy(output->output); - output->output = NULL; - wl_list_remove(&output->link); - } + wl_output_add_listener(output->output, 
&output_listener, output); + wl_list_insert(&wl->output_list, &output->link); } - if (wl->display.shm) - wl_shm_destroy(wl->display.shm); + if (!strcmp(interface, zxdg_shell_v6_interface.name) && found++) { + wl->shell = wl_registry_bind(reg, id, &zxdg_shell_v6_interface, 1); + zxdg_shell_v6_add_listener(wl->shell, &xdg_shell_listener, wl); + } - if (wl->display.shell) - wl_shell_destroy(wl->display.shell); + if (!strcmp(interface, wl_seat_interface.name) && found++) { + wl->seat = wl_registry_bind(reg, id, &wl_seat_interface, 1); + wl_seat_add_listener(wl->seat, &seat_listener, wl); + } - if (wl->display.subcomp) - wl_subcompositor_destroy(wl->display.subcomp); + if (!strcmp(interface, wl_shm_interface.name) && found++) { + wl->shm = wl_registry_bind(reg, id, &wl_shm_interface, 1); + } - if (wl->display.compositor) - wl_compositor_destroy(wl->display.compositor); + if (!strcmp(interface, wl_data_device_manager_interface.name) && (ver >= 3) && found++) { + wl->dnd_devman = wl_registry_bind(reg, id, &wl_data_device_manager_interface, 3); + } - if (wl->display.registry) - wl_registry_destroy(wl->display.registry); + if (!strcmp(interface, org_kde_kwin_server_decoration_manager_interface.name) && found++) { + wl->server_decoration_manager = wl_registry_bind(reg, id, &org_kde_kwin_server_decoration_manager_interface, 1); + } - if (wl->display.display) { - wl_display_flush(wl->display.display); - wl_display_disconnect(wl->display.display); + if (!strcmp(interface, zwp_idle_inhibit_manager_v1_interface.name) && found++) { + wl->idle_inhibit_manager = wl_registry_bind(reg, id, &zwp_idle_inhibit_manager_v1_interface, 1); } + + if (found > 1) + MP_VERBOSE(wl, "Registered for protocol %s\n", interface); } -static bool create_window(struct vo_wayland_state *wl) +static void remove_output(struct vo_wayland_output *out) { - wl->window.video_surface = - wl_compositor_create_surface(wl->display.compositor); - - wl_surface_add_listener(wl->window.video_surface, - 
&surface_listener, wl); + if (!out) + return; - if (wl->display.shell) { - wl->window.shell_surface = wl_shell_get_shell_surface(wl->display.shell, - wl->window.video_surface); + MP_VERBOSE(out->wl, "Deregistering output %s %s (0x%x)\n", out->make, + out->model, out->id); + wl_list_remove(&out->link); + talloc_free(out->make); + talloc_free(out->model); + talloc_free(out); + return; +} - if (!wl->window.shell_surface) { - MP_ERR(wl, "creating shell surface failed\n"); - return false; +static void registry_handle_remove(void *data, struct wl_registry *reg, uint32_t id) +{ + struct vo_wayland_state *wl = data; + struct vo_wayland_output *output, *tmp; + wl_list_for_each_safe(output, tmp, &wl->output_list, link) { + if (output->id == id) { + remove_output(output); + return; } - - wl_shell_surface_add_listener(wl->window.shell_surface, - &shell_surface_listener, wl); - - wl_shell_surface_set_toplevel(wl->window.shell_surface); - wl_shell_surface_set_class(wl->window.shell_surface, "mpv"); } - - return true; } -static void destroy_window(struct vo_wayland_state *wl) -{ - if (wl->window.shell_surface) - wl_shell_surface_destroy(wl->window.shell_surface); - - if (wl->window.video_surface) - wl_surface_destroy(wl->window.video_surface); +static const struct wl_registry_listener registry_listener = { + registry_handle_add, + registry_handle_remove, +}; - if (wl->frame.callback) - wl_callback_destroy(wl->frame.callback); +static void handle_surface_config(void *data, struct zxdg_surface_v6 *surface, + uint32_t serial) +{ + zxdg_surface_v6_ack_configure(surface, serial); } -static bool create_cursor(struct vo_wayland_state *wl) +static const struct zxdg_surface_v6_listener xdg_surface_listener = { + handle_surface_config, +}; + +static void handle_toplevel_config(void *data, struct zxdg_toplevel_v6 *toplevel, + int32_t width, int32_t height, struct wl_array *states) { - if (!wl->display.shm) { - MP_ERR(wl->vo, "no shm interface available\n"); - return false; + struct 
vo_wayland_state *wl = data; + struct mp_rect old_geometry = wl->geometry; + + int prev_fs_state = wl->fullscreen; + bool maximized = false; + wl->fullscreen = false; + enum zxdg_toplevel_v6_state *state; + wl_array_for_each(state, states) { + switch (*state) { + case ZXDG_TOPLEVEL_V6_STATE_FULLSCREEN: + wl->fullscreen = true; + break; + case ZXDG_TOPLEVEL_V6_STATE_RESIZING: + wl->pending_vo_events |= VO_EVENT_LIVE_RESIZING; + break; + case ZXDG_TOPLEVEL_V6_STATE_MAXIMIZED: + maximized = true; + break; + case ZXDG_TOPLEVEL_V6_STATE_ACTIVATED: + break; + } } - wl->cursor.surface = - wl_compositor_create_surface(wl->display.compositor); + if (prev_fs_state != wl->fullscreen) + wl->pending_vo_events |= VO_EVENT_FULLSCREEN_STATE; + if (!(wl->pending_vo_events & VO_EVENT_LIVE_RESIZING)) + vo_query_and_reset_events(wl->vo, VO_EVENT_LIVE_RESIZING); + + if (width > 0 && height > 0) { + if (!wl->fullscreen) { + if (wl->vo->opts->keepaspect && wl->vo->opts->keepaspect_window && + !maximized) { + if (width > height) + width = height * wl->aspect_ratio; + else + height = width / wl->aspect_ratio; + } + wl->window_size.x0 = 0; + wl->window_size.y0 = 0; + wl->window_size.x1 = width; + wl->window_size.y1 = height; + } + wl->geometry.x0 = 0; + wl->geometry.y0 = 0; + wl->geometry.x1 = width; + wl->geometry.y1 = height; + } else { + wl->geometry = wl->window_size; + } - if (!wl->cursor.surface) - return false; + if (mp_rect_equals(&old_geometry, &wl->geometry)) + return; - wl->cursor.theme = wl_cursor_theme_load(NULL, 32, wl->display.shm); - wl->cursor.default_cursor = wl_cursor_theme_get_cursor(wl->cursor.theme, - "left_ptr"); + MP_VERBOSE(wl, "Resizing due to xdg from %ix%i to %ix%i\n", + mp_rect_w(old_geometry)*wl->scaling, mp_rect_h(old_geometry)*wl->scaling, + mp_rect_w(wl->geometry)*wl->scaling, mp_rect_h(wl->geometry)*wl->scaling); - return true; + wl->pending_vo_events |= VO_EVENT_RESIZE; } -static void destroy_cursor(struct vo_wayland_state *wl) +static void 
handle_toplevel_close(void *data, struct zxdg_toplevel_v6 *xdg_toplevel) { - if (wl->cursor.theme) - wl_cursor_theme_destroy(wl->cursor.theme); - - if (wl->cursor.surface) - wl_surface_destroy(wl->cursor.surface); + struct vo_wayland_state *wl = data; + mp_input_put_key(wl->vo->input_ctx, MP_KEY_CLOSE_WIN); } -static bool create_input(struct vo_wayland_state *wl) +static const struct zxdg_toplevel_v6_listener xdg_toplevel_listener = { + handle_toplevel_config, + handle_toplevel_close, +}; + +static int create_xdg_surface(struct vo_wayland_state *wl) { - wl->input.xkb.context = xkb_context_new(0); + wl->xdg_surface = zxdg_shell_v6_get_xdg_surface(wl->shell, wl->surface); + zxdg_surface_v6_add_listener(wl->xdg_surface, &xdg_surface_listener, wl); - if (!wl->input.xkb.context) { - MP_ERR(wl, "failed to initialize input: check xkbcommon\n"); - return false; - } + wl->xdg_toplevel = zxdg_surface_v6_get_toplevel(wl->xdg_surface); + zxdg_toplevel_v6_add_listener(wl->xdg_toplevel, &xdg_toplevel_listener, wl); - return true; + zxdg_toplevel_v6_set_title (wl->xdg_toplevel, "mpv"); + zxdg_toplevel_v6_set_app_id(wl->xdg_toplevel, "mpv"); + + return 0; } -static void destroy_input(struct vo_wayland_state *wl) +static int set_border_decorations(struct vo_wayland_state *wl, int state) { - if (wl->input.keyboard) { - wl_keyboard_destroy(wl->input.keyboard); - xkb_keymap_unref(wl->input.xkb.keymap); - xkb_state_unref(wl->input.xkb.state); + if (!wl->server_decoration) + return VO_NOTIMPL; + enum org_kde_kwin_server_decoration_mode mode; + if (state) { + MP_VERBOSE(wl, "Enabling server decorations\n"); + mode = ORG_KDE_KWIN_SERVER_DECORATION_MODE_SERVER; + } else { + MP_VERBOSE(wl, "Disabling server decorations\n"); + mode = ORG_KDE_KWIN_SERVER_DECORATION_MODE_NONE; } - - if (wl->input.xkb.context) - xkb_context_unref(wl->input.xkb.context); - - if (wl->input.pointer) - wl_pointer_destroy(wl->input.pointer); - - if (wl->input.seat) - wl_seat_destroy(wl->input.seat); + 
org_kde_kwin_server_decoration_request_mode(wl->server_decoration, mode); + return VO_TRUE; } -/*** mplayer2 interface ***/ - int vo_wayland_init(struct vo *vo) { - vo->wayland = talloc_zero(NULL, struct vo_wayland_state); - struct vo_wayland_state *wl = vo->wayland; - *wl = (struct vo_wayland_state){ + vo->wl = talloc_zero(NULL, struct vo_wayland_state); + struct vo_wayland_state *wl = vo->wl; + + *wl = (struct vo_wayland_state) { + .display = wl_display_connect(NULL), .vo = vo, .log = mp_log_new(wl, vo->log, "wayland"), + .scaling = 1, .wakeup_pipe = {-1, -1}, + .dnd_fd = -1, }; - wl_list_init(&wl->display.output_list); + wl_list_init(&wl->output_list); + + if (!wl->display) + return false; + + if (create_input(wl)) + return false; + + wl->registry = wl_display_get_registry(wl->display); + wl_registry_add_listener(wl->registry, ®istry_listener, wl); + + /* Do a roundtrip to run the registry */ + wl_display_roundtrip(wl->display); + + if (!wl->shell) { + MP_FATAL(wl, "Compositor doesn't support the required %s protocol!\n", + zxdg_shell_v6_interface.name); + return false; + } + + if (!wl_list_length(&wl->output_list)) { + MP_FATAL(wl, "No outputs found or compositor doesn't support %s (ver. 2)\n", + wl_output_interface.name); + return false; + } - if (!create_input(wl) - || !create_display(wl) - || !create_window(wl) - || !create_cursor(wl)) - { - vo_wayland_uninit(vo); + /* Can't be initialized during registry due to multi-protocol dependence */ + if (create_xdg_surface(wl)) return false; + + if (wl->dnd_devman) { + wl->dnd_ddev = wl_data_device_manager_get_data_device(wl->dnd_devman, wl->seat); + wl_data_device_add_listener(wl->dnd_ddev, &data_device_listener, wl); + } else { + MP_VERBOSE(wl, "Compositor doesn't support the %s (ver. 3) protocol!\n", + wl_data_device_manager_interface.name); } - // create_display's roundtrip only adds the interfaces - // the second roundtrip receives output modes, geometry and more ... 
- wl_display_roundtrip(wl->display.display); - - struct vo_wayland_output *o = NULL; - wl_list_for_each(o, &wl->display.output_list, link) { - MP_VERBOSE(wl, "output received:\n" - "\tvendor: %s\n" - "\tmodel: %s\n" - "\tw: %d, h: %d\n" - "\tscale: %d\n" - "\tHz: %f\n", - o->make, o->model, - o->width, o->height, o->scale, - o->refresh_rate / 1000.0f); + if (wl->server_decoration_manager) { + wl->server_decoration = org_kde_kwin_server_decoration_manager_create(wl->server_decoration_manager, wl->surface); + set_border_decorations(wl, vo->opts->border); + } else { + MP_VERBOSE(wl, "Compositor doesn't support the %s protocol!\n", + org_kde_kwin_server_decoration_manager_interface.name); } + if (!wl->idle_inhibit_manager) + MP_VERBOSE(wl, "Compositor doesn't support the %s protocol!\n", + zwp_idle_inhibit_manager_v1_interface.name); + + wl->display_fd = wl_display_get_fd(wl->display); mp_make_wakeup_pipe(wl->wakeup_pipe); return true; @@ -947,220 +1052,340 @@ int vo_wayland_init(struct vo *vo) void vo_wayland_uninit(struct vo *vo) { - struct vo_wayland_state *wl = vo->wayland; - destroy_cursor(wl); - destroy_window(wl); - destroy_input(wl); - destroy_display(wl); + struct vo_wayland_state *wl = vo->wl; + if (!wl) + return; + + mp_input_put_key(wl->vo->input_ctx, MP_INPUT_RELEASE_ALL); + + if (wl->cursor_theme) + wl_cursor_theme_destroy(wl->cursor_theme); + + if (wl->cursor_surface) + wl_surface_destroy(wl->cursor_surface); + + if (wl->xkb_context) + xkb_context_unref(wl->xkb_context); + + if (wl->idle_inhibitor) + zwp_idle_inhibitor_v1_destroy(wl->idle_inhibitor); + + if (wl->idle_inhibit_manager) + zwp_idle_inhibit_manager_v1_destroy(wl->idle_inhibit_manager); + + if (wl->shell) + zxdg_shell_v6_destroy(wl->shell); + + if (wl->shm) + wl_shm_destroy(wl->shm); + + if (wl->dnd_devman) + wl_data_device_manager_destroy(wl->dnd_devman); + + if (wl->server_decoration) + org_kde_kwin_server_decoration_destroy(wl->server_decoration); + + if (wl->server_decoration_manager) + 
org_kde_kwin_server_decoration_manager_destroy(wl->server_decoration_manager); + + if (wl->surface) + wl_surface_destroy(wl->surface); + + if (wl->frame_callback) + wl_callback_destroy(wl->frame_callback); + + if (wl->display) { + close(wl_display_get_fd(wl->display)); + wl_display_disconnect(wl->display); + } + + struct vo_wayland_output *output, *tmp; + wl_list_for_each_safe(output, tmp, &wl->output_list, link) + remove_output(output); + + talloc_free(wl->dnd_mime_type); + for (int n = 0; n < 2; n++) close(wl->wakeup_pipe[n]); talloc_free(wl); - vo->wayland = NULL; + vo->wl = NULL; } -static void vo_wayland_ontop(struct vo *vo) +static struct vo_wayland_output *find_output(struct vo_wayland_state *wl, int index) { - struct vo_wayland_state *wl = vo->wayland; - if (!vo->opts->ontop) - return; - MP_DBG(wl, "going ontop\n"); - window_set_toplevel(wl); - schedule_resize(wl, 0, wl->window.width, wl->window.height); + int screen_id = 0; + struct vo_wayland_output *output; + wl_list_for_each(output, &wl->output_list, link) { + if (index == screen_id++) + return output; + } + return NULL; } -static void vo_wayland_fullscreen(struct vo *vo) +int vo_wayland_reconfig(struct vo *vo) { - struct vo_wayland_state *wl = vo->wayland; - if (!wl->display.shell) - return; + struct wl_output *wl_out = NULL; + struct mp_rect screenrc = { 0 }; + struct vo_wayland_state *wl = vo->wl; + + MP_VERBOSE(wl, "Reconfiguring!\n"); + + /* Surface enter events happen later but we already know the outputs and we'd + * like to know the output the surface would be on (for scaling or fullscreen), + * so if fsscreen_id is set or there's only one possible output, use it. 
*/ + if (((!wl->current_output) && (wl_list_length(&wl->output_list) == 1)) || + (vo->opts->fullscreen && (vo->opts->fsscreen_id >= 0))) { + int idx = 0; + if (vo->opts->fullscreen && (vo->opts->fsscreen_id >= 0)) + idx = vo->opts->fsscreen_id; + struct vo_wayland_output *out = find_output(wl, idx); + if (!out) { + MP_ERR(wl, "Screen index %i not found/unavailable!\n", idx); + } else { + wl_out = out->output; + wl->current_output = out; + wl->scaling = out->scale; + screenrc = wl->current_output->geometry; + } + } + + struct vo_win_geometry geo; + vo_calc_window_geometry(vo, &screenrc, &geo); + vo_apply_window_geometry(vo, &geo); - struct wl_output *fs_output = wl->display.fs_output; + wl->geometry.x0 = 0; + wl->geometry.y0 = 0; + wl->geometry.x1 = vo->dwidth / wl->scaling; + wl->geometry.y1 = vo->dheight / wl->scaling; + wl->window_size = wl->geometry; + wl->aspect_ratio = vo->dwidth / (float)vo->dheight; if (vo->opts->fullscreen) { - MP_DBG(wl, "going fullscreen\n"); - wl->window.is_fullscreen = true; - wl->window.p_width = wl->window.width; - wl->window.p_height = wl->window.height; - if (wl->display.current_output) - schedule_resize(wl, 0, wl->display.current_output->width, - wl->display.current_output->height); - wl_shell_surface_set_fullscreen(wl->window.shell_surface, - WL_SHELL_SURFACE_FULLSCREEN_METHOD_DEFAULT, - 0, fs_output); + /* If already fullscreen, fix resolution for the frame size change */ + if (wl->fullscreen && wl->current_output) { + wl->geometry.x0 = 0; + wl->geometry.y0 = 0; + wl->geometry.x1 = mp_rect_w(wl->current_output->geometry)/wl->scaling; + wl->geometry.y1 = mp_rect_h(wl->current_output->geometry)/wl->scaling; + } else { + zxdg_toplevel_v6_set_fullscreen(wl->xdg_toplevel, wl_out); + } } - else { - MP_DBG(wl, "leaving fullscreen\n"); - wl->window.is_fullscreen = false; - window_set_toplevel(wl); - schedule_resize(wl, 0, wl->window.p_width, wl->window.p_height); + wl_surface_set_buffer_scale(wl->surface, wl->scaling); + 
wl_surface_commit(wl->surface); + wl->pending_vo_events |= VO_EVENT_RESIZE; + if (!wl->configured) { + if (spawn_cursor(wl)) + return false; + wl_display_roundtrip(wl->display); + wl->configured = true; } + + return true; } -static void vo_wayland_update_screeninfo(struct vo *vo, struct mp_rect *screenrc) +static int set_screensaver_inhibitor(struct vo_wayland_state *wl, int state) { - struct vo_wayland_state *wl = vo->wayland; - struct mp_vo_opts *opts = vo->opts; - - *screenrc = (struct mp_rect){0}; + if (!wl->idle_inhibit_manager) + return VO_NOTIMPL; + if (state == (!!wl->idle_inhibitor)) + return VO_TRUE; + if (state) { + MP_VERBOSE(wl, "Enabling idle inhibitor\n"); + struct zwp_idle_inhibit_manager_v1 *mgr = wl->idle_inhibit_manager; + wl->idle_inhibitor = zwp_idle_inhibit_manager_v1_create_inhibitor(mgr, wl->surface); + } else { + MP_VERBOSE(wl, "Disabling the idle inhibitor\n"); + zwp_idle_inhibitor_v1_destroy(wl->idle_inhibitor); + } + return VO_TRUE; +} - int screen_id = 0; +static int toggle_fullscreen(struct vo_wayland_state *wl) +{ + if (!wl->xdg_toplevel) + return VO_NOTAVAIL; + if (wl->fullscreen) + zxdg_toplevel_v6_unset_fullscreen(wl->xdg_toplevel); + else + zxdg_toplevel_v6_set_fullscreen(wl->xdg_toplevel, NULL); + return VO_TRUE; +} - struct vo_wayland_output *output; - struct vo_wayland_output *first_output = NULL; - struct vo_wayland_output *fsscreen_output = NULL; +static int update_window_title(struct vo_wayland_state *wl, char *title) +{ + if (!wl->xdg_toplevel) + return VO_NOTAVAIL; + zxdg_toplevel_v6_set_title(wl->xdg_toplevel, title); + return VO_TRUE; +} - if (opts->fsscreen_id >= 0) { - wl_list_for_each_reverse(output, &wl->display.output_list, link) { - if (!output || !output->width) - continue; +static void check_dnd_fd(struct vo_wayland_state *wl) +{ + if (wl->dnd_fd == -1) + return; - if (opts->fsscreen_id == screen_id) - fsscreen_output = output; + struct pollfd fdp = { wl->dnd_fd, POLLIN | POLLERR | POLLHUP, 0 }; + if (poll(&fdp, 
1, 0) <= 0) + return; - screen_id++; + if (fdp.revents & POLLIN) { + ptrdiff_t offset = 0; + size_t data_read = 0; + const size_t chunk_size = 1; + uint8_t *buffer = ta_zalloc_size(wl, chunk_size); + if (!buffer) + goto end; + + while ((data_read = read(wl->dnd_fd, buffer + offset, chunk_size)) > 0) { + offset += data_read; + buffer = ta_realloc_size(wl, buffer, offset + chunk_size); + memset(buffer + offset, 0, chunk_size); + if (!buffer) + goto end; } - } - if (fsscreen_output) { - wl->display.fs_output = fsscreen_output->output; - screenrc->x1 = fsscreen_output->width; - screenrc->y1 = fsscreen_output->height; + MP_VERBOSE(wl, "Read %td bytes from the DND fd\n", offset); + + struct bstr file_list = bstr0(buffer); + mp_event_drop_mime_data(wl->vo->input_ctx, wl->dnd_mime_type, + file_list, wl->dnd_action); + talloc_free(buffer); +end: + wl_data_offer_finish(wl->dnd_offer); + talloc_free(wl->dnd_mime_type); + wl->dnd_mime_type = NULL; + wl->dnd_mime_score = 0; } - else { - wl->display.fs_output = NULL; /* current output is always 0 */ - if (first_output) { - screenrc->x1 = wl->display.current_output->width; - screenrc->y1 = wl->display.current_output->height; - } + if (fdp.revents & (POLLIN | POLLERR | POLLHUP)) { + close(wl->dnd_fd); + wl->dnd_fd = -1; } +} - wl->window.fs_width = screenrc->x1; - wl->window.fs_height = screenrc->y1; +static char **get_displays_spanned(struct vo_wayland_state *wl) +{ + char **names = NULL; + int displays_spanned = 0; + struct vo_wayland_output *output; + wl_list_for_each(output, &wl->output_list, link) { + if (output->has_surface) + MP_TARRAY_APPEND(NULL, names, displays_spanned, + talloc_strdup(NULL, output->model)); + } + MP_TARRAY_APPEND(NULL, names, displays_spanned, NULL); + return names; } int vo_wayland_control(struct vo *vo, int *events, int request, void *arg) { - struct vo_wayland_state *wl = vo->wayland; - wl_display_dispatch_pending(wl->display.display); + struct vo_wayland_state *wl = vo->wl; + 
wl_display_dispatch_pending(wl->display); switch (request) { - case VOCTRL_CHECK_EVENTS: - *events |= wl->window.events; - wl->window.events = 0; + case VOCTRL_CHECK_EVENTS: { + check_dnd_fd(wl); + *events |= wl->pending_vo_events; + wl->pending_vo_events = 0; return VO_TRUE; - case VOCTRL_FULLSCREEN: - vo_wayland_fullscreen(vo); + } + case VOCTRL_GET_FULLSCREEN: { + *(int *)arg = wl->fullscreen; return VO_TRUE; - case VOCTRL_ONTOP: - vo_wayland_ontop(vo); + } + case VOCTRL_GET_DISPLAY_NAMES: { + *(char ***)arg = get_displays_spanned(wl); return VO_TRUE; - case VOCTRL_GET_UNFS_WINDOW_SIZE: { - int *s = arg, scale = 1; - if (wl->display.current_output) - scale = wl->display.current_output->scale; - s[0] = scale*wl->window.width; - s[1] = scale*wl->window.height; + } + case VOCTRL_PAUSE: { + wl_callback_destroy(wl->frame_callback); + wl->frame_callback = NULL; + vo_disable_external_renderloop(wl->vo); return VO_TRUE; } - case VOCTRL_SET_UNFS_WINDOW_SIZE: { + case VOCTRL_RESUME: { + vo_enable_external_renderloop(wl->vo); + frame_callback(wl, NULL, 0); + return VO_TRUE; + } + case VOCTRL_GET_UNFS_WINDOW_SIZE: { int *s = arg; - if (!wl->window.is_fullscreen) - schedule_resize(wl, 0, s[0], s[1]); + s[0] = mp_rect_w(wl->geometry)*wl->scaling; + s[1] = mp_rect_h(wl->geometry)*wl->scaling; return VO_TRUE; } - case VOCTRL_SET_CURSOR_VISIBILITY: - if (*(bool *)arg) { - if (!wl->cursor.visible) - show_cursor(wl); - } - else { - if (wl->cursor.visible) - hide_cursor(wl); + case VOCTRL_SET_UNFS_WINDOW_SIZE: { + int *s = arg; + if (!wl->fullscreen) { + wl->geometry.x0 = 0; + wl->geometry.y0 = 0; + wl->geometry.x1 = s[0]/wl->scaling; + wl->geometry.y1 = s[1]/wl->scaling; + wl->pending_vo_events |= VO_EVENT_RESIZE; } - wl->cursor.visible = *(bool *)arg; - return VO_TRUE; - case VOCTRL_UPDATE_WINDOW_TITLE: - window_set_title(wl, (char*) arg); return VO_TRUE; + } case VOCTRL_GET_DISPLAY_FPS: { - if (!wl->display.current_output) - break; - - // refresh rate is stored in milli-Hertz 
(mHz) - double fps = wl->display.current_output->refresh_rate / 1000.0f; - *(double*) arg = fps; + if (!wl->current_output) + return VO_NOTAVAIL; + *(double *)arg = wl->current_output->refresh_rate; return VO_TRUE; } + case VOCTRL_UPDATE_WINDOW_TITLE: + return update_window_title(wl, (char *)arg); + case VOCTRL_FULLSCREEN: + return toggle_fullscreen(wl); + case VOCTRL_SET_CURSOR_VISIBILITY: + return set_cursor_visibility(wl, *(bool *)arg); + case VOCTRL_BORDER: + return set_border_decorations(wl, vo->opts->border); + case VOCTRL_KILL_SCREENSAVER: + return set_screensaver_inhibitor(wl, true); + case VOCTRL_RESTORE_SCREENSAVER: + return set_screensaver_inhibitor(wl, false); } - return VO_NOTIMPL; -} -bool vo_wayland_config(struct vo *vo) -{ - struct vo_wayland_state *wl = vo->wayland; - - struct mp_rect screenrc; - vo_wayland_update_screeninfo(vo, &screenrc); - - struct vo_win_geometry geo; - vo_calc_window_geometry(vo, &screenrc, &geo); - vo_apply_window_geometry(vo, &geo); - - wl->window.p_width = vo->dwidth; - wl->window.p_height = vo->dheight; - wl->window.aspect = vo->dwidth / (float) MPMAX(vo->dheight, 1); - - wl->window.width = vo->dwidth; - wl->window.height = vo->dheight; - vo_wayland_fullscreen(vo); - - return true; -} - -void vo_wayland_request_frame(struct vo *vo, void *data, vo_wayland_frame_cb cb) -{ - struct vo_wayland_state *wl = vo->wayland; - wl->frame.data = data; - wl->frame.function = cb; - MP_DBG(wl, "restart frame callback\n"); - frame_callback(wl, NULL, 0); + return VO_NOTIMPL; } void vo_wayland_wakeup(struct vo *vo) { - struct vo_wayland_state *wl = vo->wayland; + struct vo_wayland_state *wl = vo->wl; (void)write(wl->wakeup_pipe[1], &(char){0}, 1); } void vo_wayland_wait_events(struct vo *vo, int64_t until_time_us) { - struct vo_wayland_state *wl = vo->wayland; - struct wl_display *dp = wl->display.display; + struct vo_wayland_state *wl = vo->wl; + struct wl_display *display = wl->display; + + if (wl->display_fd == -1) + return; struct pollfd 
fds[2] = { - {.fd = wl->display.display_fd, .events = POLLIN }, - {.fd = wl->wakeup_pipe[0], .events = POLLIN }, + {.fd = wl->display_fd, .events = POLLIN }, + {.fd = wl->wakeup_pipe[0], .events = POLLIN }, }; int64_t wait_us = until_time_us - mp_time_us(); int timeout_ms = MPCLAMP((wait_us + 999) / 1000, 0, 10000); - wl_display_dispatch_pending(dp); - wl_display_flush(dp); + wl_display_dispatch_pending(display); + wl_display_flush(display); poll(fds, 2, timeout_ms); if (fds[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { - MP_FATAL(wl, "error occurred on the display fd: " - "closing file descriptor\n"); - close(wl->display.display_fd); + MP_FATAL(wl, "Error occurred on the display fd, closing\n"); + close(wl->display_fd); + wl->display_fd = -1; mp_input_put_key(vo->input_ctx, MP_KEY_CLOSE_WIN); } if (fds[0].revents & POLLIN) - wl_display_dispatch(dp); + wl_display_dispatch(display); if (fds[1].revents & POLLIN) mp_flush_wakeup_pipe(wl->wakeup_pipe[0]); diff --git a/video/out/wayland_common.h b/video/out/wayland_common.h index 4bb90d6..4911009 100644 --- a/video/out/wayland_common.h +++ b/video/out/wayland_common.h @@ -1,6 +1,5 @@ /* * This file is part of mpv video player. 
- * Copyright © 2013 Alexander Preisinger <alexander.preisinger@gmail.com> * * mpv is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -19,133 +18,96 @@ #ifndef MPLAYER_WAYLAND_COMMON_H #define MPLAYER_WAYLAND_COMMON_H -#include <stdint.h> -#include <stdbool.h> #include <wayland-client.h> #include <wayland-cursor.h> #include <xkbcommon/xkbcommon.h> -#include "config.h" - -#if HAVE_GL_WAYLAND -#include <wayland-egl.h> -#include <EGL/egl.h> -#include <EGL/eglext.h> -#endif - -struct vo; +#include "vo.h" +#include "input/event.h" struct vo_wayland_output { - uint32_t id; /* unique name */ + struct vo_wayland_state *wl; + uint32_t id; struct wl_output *output; + struct mp_rect geometry; + int phys_width; + int phys_height; + int scale; uint32_t flags; - int32_t width; - int32_t height; - int32_t scale; - int32_t refresh_rate; // fps (mHz) - const char *make; - const char *model; + double refresh_rate; + char *make; + char *model; + bool has_surface; struct wl_list link; }; -typedef void (*vo_wayland_frame_cb)(void *data, uint32_t time); - struct vo_wayland_state { - struct vo *vo; - struct mp_log* log; + struct mp_log *log; + struct vo *vo; + struct wl_display *display; + struct wl_shm *shm; + struct wl_compositor *compositor; + struct wl_registry *registry; + + /* State */ + struct mp_rect geometry; + struct mp_rect window_size; + float aspect_ratio; + bool fullscreen; + bool configured; int wakeup_pipe[2]; - - struct { - void *data; - vo_wayland_frame_cb function; - struct wl_callback *callback; - } frame; - -#if HAVE_GL_WAYLAND - struct { - EGLSurface egl_surface; - - struct wl_egl_window *egl_window; - - struct { - EGLDisplay dpy; - EGLContext ctx; - EGLConfig conf; - } egl; - } egl_context; -#endif - - struct { - int fd; - struct wl_display *display; - struct wl_registry *registry; - struct wl_compositor *compositor; - struct wl_shell *shell; - - struct wl_list output_list; - struct wl_output *fs_output; 
/* fullscreen output */ - struct vo_wayland_output *current_output; - - int display_fd; - - struct wl_shm *shm; - - struct wl_subcompositor *subcomp; - } display; - - struct { - int32_t width; // current size of the window - int32_t height; - int32_t p_width; // previous sizes for leaving fullscreen - int32_t p_height; - int32_t sh_width; // sheduled width for resizing - int32_t sh_height; - int32_t sh_x; // x, y calculated with the drag edges for moving - int32_t sh_y; - float aspect; - - bool is_fullscreen; // don't keep aspect ratio in fullscreen mode - int32_t fs_width; // fullscreen sizes - int32_t fs_height; - - struct wl_surface *video_surface; - int32_t mouse_x; // mouse position inside the surface - int32_t mouse_y; - struct wl_shell_surface *shell_surface; - int events; /* mplayer events (VO_EVENT_RESIZE) */ - } window; - - struct { - struct wl_cursor *default_cursor; - struct wl_cursor_theme *theme; - struct wl_surface *surface; - - /* pointer for fading out */ - bool visible; - struct wl_pointer *pointer; - uint32_t serial; - } cursor; - - struct { - struct wl_seat *seat; - struct wl_keyboard *keyboard; - struct wl_pointer *pointer; - - struct { - struct xkb_context *context; - struct xkb_keymap *keymap; - struct xkb_state *state; - } xkb; - } input; + int pending_vo_events; + int mouse_x; + int mouse_y; + int scaling; + int touch_entries; + uint32_t pointer_id; + int display_fd; + struct wl_callback *frame_callback; + struct wl_list output_list; + struct vo_wayland_output *current_output; + + /* Shell */ + struct wl_surface *surface; + struct zxdg_shell_v6 *shell; + struct zxdg_toplevel_v6 *xdg_toplevel; + struct zxdg_surface_v6 *xdg_surface; + struct org_kde_kwin_server_decoration_manager *server_decoration_manager; + struct org_kde_kwin_server_decoration *server_decoration; + struct zwp_idle_inhibit_manager_v1 *idle_inhibit_manager; + struct zwp_idle_inhibitor_v1 *idle_inhibitor; + + /* Input */ + struct wl_seat *seat; + struct wl_pointer *pointer; + 
struct wl_touch *touch; + struct wl_keyboard *keyboard; + struct xkb_context *xkb_context; + struct xkb_keymap *xkb_keymap; + struct xkb_state *xkb_state; + + /* DND */ + struct wl_data_device_manager *dnd_devman; + struct wl_data_device *dnd_ddev; + struct wl_data_offer *dnd_offer; + enum mp_dnd_action dnd_action; + char *dnd_mime_type; + int dnd_mime_score; + int dnd_fd; + + /* Cursor */ + struct wl_cursor_theme *cursor_theme; + struct wl_cursor *default_cursor; + struct wl_surface *cursor_surface; + int allocated_cursor_scale; }; int vo_wayland_init(struct vo *vo); -void vo_wayland_uninit(struct vo *vo); -bool vo_wayland_config(struct vo *vo); +int vo_wayland_reconfig(struct vo *vo); int vo_wayland_control(struct vo *vo, int *events, int request, void *arg); +void vo_wayland_check_events(struct vo *vo); +void vo_wayland_uninit(struct vo *vo); void vo_wayland_wakeup(struct vo *vo); void vo_wayland_wait_events(struct vo *vo, int64_t until_time_us); -void vo_wayland_request_frame(struct vo *vo, void *data, vo_wayland_frame_cb cb); #endif /* MPLAYER_WAYLAND_COMMON_H */ - diff --git a/video/out/win32/exclusive_hack.c b/video/out/win32/exclusive_hack.c deleted file mode 100644 index 668dfd5..0000000 --- a/video/out/win32/exclusive_hack.c +++ /dev/null @@ -1,97 +0,0 @@ -/* - * This file is part of mpv. - * - * mpv is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. - * - * mpv is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with mpv. If not, see <http://www.gnu.org/licenses/>. 
- */ - -#include <windows.h> -#include <winternl.h> -#include <pthread.h> - -#include "exclusive_hack.h" - -// Missing NT API definitions -typedef enum _MP_MUTANT_INFORMATION_CLASS { - MpMutantBasicInformation -} MP_MUTANT_INFORMATION_CLASS; -#define MUTANT_INFORMATION_CLASS MP_MUTANT_INFORMATION_CLASS -#define MutantBasicInformation MpMutantBasicInformation - -typedef struct _MP_MUTANT_BASIC_INFORMATION { - LONG CurrentCount; - BOOLEAN OwnedByCaller; - BOOLEAN AbandonedState; -} MP_MUTANT_BASIC_INFORMATION; -#define MUTANT_BASIC_INFORMATION MP_MUTANT_BASIC_INFORMATION - -static pthread_once_t internal_api_load_ran = PTHREAD_ONCE_INIT; -static bool internal_api_loaded = false; - -static HANDLE excl_mode_mutex; -static NTSTATUS (NTAPI *pNtQueryMutant)(HANDLE MutantHandle, - MUTANT_INFORMATION_CLASS MutantInformationClass, PVOID MutantInformation, - ULONG MutantInformationLength, PULONG ReturnLength); - -static void internal_api_load(void) -{ - HMODULE ntdll = GetModuleHandleW(L"ntdll.dll"); - if (!ntdll) - return; - pNtQueryMutant = (void*)GetProcAddress(ntdll, "NtQueryMutant"); - if (!pNtQueryMutant) - return; - excl_mode_mutex = OpenMutexW(MUTANT_QUERY_STATE, FALSE, - L"Local\\__DDrawExclMode__"); - if (!excl_mode_mutex) - return; - - internal_api_loaded = true; -} - -bool mp_w32_is_in_exclusive_mode(void) -{ - pthread_once(&internal_api_load_ran, internal_api_load); - if (!internal_api_loaded) - return false; - - // As far as we can tell, there is no way to know if a specific OpenGL - // program is being redirected by the DWM. It is possible, however, to - // know if some program on the computer is unredirected by the DWM, that - // is, whether some program is in exclusive fullscreen mode. Exclusive - // fullscreen programs acquire an undocumented mutex: __DDrawExclMode__. If - // this is acquired, it's probably by mpv. 
Even if it isn't, the penalty - // for incorrectly guessing true (dropped frames) is better than the - // penalty for incorrectly guessing false (tearing.) - - // Testing this mutex is another problem. There is no public function for - // testing a mutex without attempting to acquire it, but that method won't - // work because if mpv is in fullscreen, the mutex will already be acquired - // by this thread (in ddraw.dll) and Windows will happily let it be - // acquired again. Instead, use the undocumented NtQueryMutant function to - // test the mutex. - - // Note: SHQueryUserNotificationState uses this mutex internally, but it is - // probably not suitable because it sends a message to the shell instead of - // testing the mutex directly. mpv will check for exclusive mode once per - // frame, so if the shell is not running or not responding, it may cause - // performance issues. - - MUTANT_BASIC_INFORMATION mbi; - NTSTATUS s = pNtQueryMutant(excl_mode_mutex, MutantBasicInformation, &mbi, - sizeof mbi, NULL); - if (!NT_SUCCESS(s)) - return false; - - return !mbi.CurrentCount; -} diff --git a/video/out/x11_common.h b/video/out/x11_common.h index e69640c..1c00963 100644 --- a/video/out/x11_common.h +++ b/video/out/x11_common.h @@ -29,6 +29,11 @@ #include "common/common.h" +#include "config.h" +#if !HAVE_GPL +#error GPL only +#endif + struct vo; struct mp_log; |