New upstream version 0.29.0

author: James Cowgill <jcowgill@debian.org> 2018-07-27 14:24:34 +0800
committer: James Cowgill <jcowgill@debian.org> 2018-07-27 14:24:34 +0800
commit: f4faf74f8747c113bd8c1f99e6b6fb1983f11e0d (patch)
tree: a9888a5b34d33fa31cc656c856d81333aa0e3ab3 /video/out
parent: d96cb5fac5258f82733a6e26aa212939f2ce991d (diff)
87 files changed, 6317 insertions, 2472 deletions
diff --git a/video/out/cocoa-cb/events_view.swift b/video/out/cocoa-cb/events_view.swift
new file mode 100644
index 0000000..7cc295f
--- /dev/null
+++ b/video/out/cocoa-cb/events_view.swift
@@ -0,0 +1,267 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import Cocoa
+
+/// View that receives mouse, scroll and drag-and-drop events and forwards
+/// them to mpv through the cocoa input helpers.
+class EventsView: NSView {
+
+    /// Backend that owns this view. Held weakly, so it can become nil.
+    weak var cocoaCB: CocoaCB!
+    /// mpv helper of the backend, or nil when `cocoaCB` is nil.
+    var mpv: MPVHelper! {
+        get { return cocoaCB == nil ? nil : cocoaCB.mpv }
+    }
+
+    /// Tracking area installed by the last `updateTrackingAreas()` call.
+    var tracker: NSTrackingArea?
+    /// Whether the most recently signalled mouse button event was a press.
+    var hasMouseDown: Bool = false
+
+    /// True when mpv is reachable and its "input-cursor" property is enabled.
+    private var isCursorInputEnabled: Bool {
+        return mpv != nil && mpv.getBoolProperty("input-cursor")
+    }
+
+    /// Window of the backend, or nil when the backend or its window is gone.
+    private var backendWindow: Window? {
+        if cocoaCB == nil { return nil }
+        return cocoaCB.window
+    }
+
+    override var isFlipped: Bool { return true }
+    override var acceptsFirstResponder: Bool { return true }
+
+    /// Creates the view with a 960x480 frame that resizes with its superview
+    /// and registers it for file name and URL drags.
+    init(cocoaCB ccb: CocoaCB) {
+        cocoaCB = ccb
+        super.init(frame: NSMakeRect(0, 0, 960, 480))
+        autoresizingMask = [.viewWidthSizable, .viewHeightSizable]
+        wantsBestResolutionOpenGLSurface = true
+        register(forDraggedTypes: [NSFilenamesPboardType, NSURLPboardType])
+    }
+
+    required init?(coder: NSCoder) {
+        fatalError("init(coder:) has not been implemented")
+    }
+
+    /// Replaces the tracking area with one covering the current bounds.
+    override func updateTrackingAreas() {
+        super.updateTrackingAreas()
+
+        if let oldTracker = tracker {
+            removeTrackingArea(oldTracker)
+        }
+
+        let newTracker = NSTrackingArea(rect: bounds,
+            options: [.activeAlways, .mouseEnteredAndExited, .mouseMoved, .enabledDuringMouseDrag],
+            owner: self, userInfo: nil)
+        addTrackingArea(newTracker)
+        tracker = newTracker
+
+        // NOTE(review): a leave key is sent while the pointer is *inside* the
+        // view; confirm this is intended rather than the negated condition.
+        if containsMouseLocation() {
+            cocoa_put_key_with_modifiers(SWIFT_KEY_MOUSE_LEAVE, 0)
+        }
+    }
+
+    /// Accepts drags that carry file names or URLs as a copy operation.
+    override func draggingEntered(_ sender: NSDraggingInfo) -> NSDragOperation {
+        guard let types = sender.draggingPasteboard().types else { return [] }
+        if types.contains(NSFilenamesPboardType) || types.contains(NSURLPboardType) {
+            return .copy
+        }
+        return []
+    }
+
+    /// Hands dropped file names (preferred) or URLs to the events responder.
+    /// Returns true only when a list was read and forwarded.
+    override func performDragOperation(_ sender: NSDraggingInfo) -> Bool {
+        let pb = sender.draggingPasteboard()
+        guard let types = pb.types else { return false }
+        if types.contains(NSFilenamesPboardType) {
+            if let files = pb.propertyList(forType: NSFilenamesPboardType) as? [Any] {
+                EventsResponder.sharedInstance().handleFilesArray(files)
+                return true
+            }
+        } else if types.contains(NSURLPboardType) {
+            if let url = pb.propertyList(forType: NSURLPboardType) as? [Any] {
+                EventsResponder.sharedInstance().handleFilesArray(url)
+                return true
+            }
+        }
+        return false
+    }
+
+    override func acceptsFirstMouse(for event: NSEvent?) -> Bool {
+        return true
+    }
+
+    override func becomeFirstResponder() -> Bool {
+        return true
+    }
+
+    override func resignFirstResponder() -> Bool {
+        return true
+    }
+
+    // The handlers below only forward to mpv while "input-cursor" is enabled.
+    // They tolerate a released backend instead of crashing on the weak
+    // reference.
+
+    override func mouseEntered(with event: NSEvent) {
+        if isCursorInputEnabled {
+            cocoa_put_key_with_modifiers(SWIFT_KEY_MOUSE_ENTER, 0)
+        }
+    }
+
+    override func mouseExited(with event: NSEvent) {
+        if isCursorInputEnabled {
+            cocoa_put_key_with_modifiers(SWIFT_KEY_MOUSE_LEAVE, 0)
+        }
+        backendWindow?.hideTitleBar()
+    }
+
+    override func mouseMoved(with event: NSEvent) {
+        if isCursorInputEnabled {
+            signalMouseMovement(event)
+        }
+        backendWindow?.showTitleBar()
+    }
+
+    override func mouseDragged(with event: NSEvent) {
+        if isCursorInputEnabled {
+            signalMouseMovement(event)
+        }
+    }
+
+    override func mouseDown(with event: NSEvent) {
+        if isCursorInputEnabled {
+            signalMouseDown(event)
+        }
+    }
+
+    override func mouseUp(with event: NSEvent) {
+        if isCursorInputEnabled {
+            signalMouseUp(event)
+        }
+        backendWindow?.isMoving = false
+    }
+
+    override func rightMouseDown(with event: NSEvent) {
+        if isCursorInputEnabled {
+            signalMouseDown(event)
+        }
+    }
+
+    override func rightMouseUp(with event: NSEvent) {
+        if isCursorInputEnabled {
+            signalMouseUp(event)
+        }
+    }
+
+    override func otherMouseDown(with event: NSEvent) {
+        if isCursorInputEnabled {
+            signalMouseDown(event)
+        }
+    }
+
+    override func otherMouseUp(with event: NSEvent) {
+        if isCursorInputEnabled {
+            signalMouseUp(event)
+        }
+    }
+
+    /// Signals a button press; for multi-click events the matching release
+    /// is signalled immediately afterwards.
+    func signalMouseDown(_ event: NSEvent) {
+        signalMouseEvent(event, SWIFT_KEY_STATE_DOWN)
+        if event.clickCount > 1 {
+            signalMouseEvent(event, SWIFT_KEY_STATE_UP)
+        }
+    }
+
+    /// Signals a button release.
+    func signalMouseUp(_ event: NSEvent) {
+        signalMouseEvent(event, SWIFT_KEY_STATE_UP)
+    }
+
+    /// Records the press state and sends the mpv button code combined with
+    /// `state`, along with the event's modifier flags.
+    func signalMouseEvent(_ event: NSEvent, _ state: Int32) {
+        hasMouseDown = state == SWIFT_KEY_STATE_DOWN
+        let mpkey = getMpvButton(event)
+        cocoa_put_key_with_modifiers(mpkey | state, Int32(event.modifierFlags.rawValue))
+    }
+
+    /// Converts the event location to backing coordinates with a negated y
+    /// value, updates window dragging state and reports the position to mpv
+    /// unless the window is being moved.
+    func signalMouseMovement(_ event: NSEvent) {
+        guard let win = backendWindow else { return }
+
+        var point = convert(event.locationInWindow, from: nil)
+        point = convertToBacking(point)
+        point.y = -point.y
+
+        win.updateMovableBackground(point)
+        if !win.isMoving {
+            mpv.setMousePosition(point)
+        }
+    }
+
+    /// Forwards a precise scroll event as an axis command along the dominant
+    /// direction, scaled by 0.1.
+    func preciseScroll(_ event: NSEvent) {
+        let delta: Double
+        let cmd: Int32
+
+        if fabs(event.deltaY) >= fabs(event.deltaX) {
+            delta = Double(event.deltaY) * 0.1
+            cmd = delta > 0 ? SWIFT_WHEEL_UP : SWIFT_WHEEL_DOWN
+        } else {
+            delta = Double(event.deltaX) * 0.1
+            cmd = delta > 0 ? SWIFT_WHEEL_RIGHT : SWIFT_WHEEL_LEFT
+        }
+
+        mpv.putAxis(cmd, delta: fabs(delta))
+    }
+
+    /// Forwards scroll events: precise ones as axis commands, others as a
+    /// wheel key. Holding shift swaps the horizontal and vertical deltas.
+    override func scrollWheel(with event: NSEvent) {
+        if !isCursorInputEnabled {
+            return
+        }
+
+        if event.hasPreciseScrollingDeltas {
+            preciseScroll(event)
+        } else {
+            let modifiers = event.modifierFlags
+            let deltaX = modifiers.contains(.shift) ? event.scrollingDeltaY : event.scrollingDeltaX
+            let deltaY = modifiers.contains(.shift) ? event.scrollingDeltaX : event.scrollingDeltaY
+            let mpkey: Int32
+
+            if fabs(deltaY) >= fabs(deltaX) {
+                mpkey = deltaY > 0 ? SWIFT_WHEEL_UP : SWIFT_WHEEL_DOWN
+            } else {
+                mpkey = deltaX > 0 ? SWIFT_WHEEL_RIGHT : SWIFT_WHEEL_LEFT
+            }
+
+            cocoa_put_key_with_modifiers(mpkey, Int32(modifiers.rawValue))
+        }
+    }
+
+    /// Returns whether the pointer is inside the part of the view that lies
+    /// on the window's screen, excluding the title bar area when windowed.
+    /// Returns false when the backend, the hosting window or its screen is
+    /// unavailable.
+    func containsMouseLocation() -> Bool {
+        guard let backendWin = backendWindow,
+              let hostWindow = window,
+              let hostScreen = hostWindow.screen else { return false }
+
+        var topMargin: CGFloat = 0.0
+        let menuBarHeight = NSApp.mainMenu?.menuBarHeight ?? 0
+
+        if backendWin.isInFullscreen && (menuBarHeight > 0) {
+            topMargin = backendWin.titleBarHeight + 1 + menuBarHeight
+        }
+
+        var vF = hostScreen.frame
+        vF.size.height -= topMargin
+
+        let vFW = hostWindow.convertFromScreen(vF)
+        let vFV = convert(vFW, from: nil)
+        let pt = convert(hostWindow.mouseLocationOutsideOfEventStream, from: nil)
+
+        var clippedBounds = bounds.intersection(vFV)
+        if !backendWin.isInFullscreen {
+            clippedBounds.origin.y += backendWin.titleBarHeight
+            clippedBounds.size.height -= backendWin.titleBarHeight
+        }
+        return clippedBounds.contains(pt)
+    }
+
+    /// The cursor may be hidden when no button is pressed, the pointer is
+    /// inside the view and the hosting window is the key window.
+    func canHideCursor() -> Bool {
+        guard backendWindow != nil, let hostWindow = window else { return false }
+        return !hasMouseDown && containsMouseLocation() && hostWindow.isKeyWindow
+    }
+
+    /// Maps the AppKit button number of `event` to an mpv mouse button code.
+    func getMpvButton(_ event: NSEvent) -> Int32 {
+        let buttonNumber = event.buttonNumber
+        switch buttonNumber {
+            case 0:  return SWIFT_MBTN_LEFT
+            case 1:  return SWIFT_MBTN_RIGHT
+            case 2:  return SWIFT_MBTN_MID
+            case 3:  return SWIFT_MBTN_BACK
+            case 4:  return SWIFT_MBTN_FORWARD
+            default: return SWIFT_MBTN9 + Int32(buttonNumber - 5)
+        }
+    }
+}
diff --git a/video/out/cocoa-cb/video_layer.swift b/video/out/cocoa-cb/video_layer.swift
new file mode 100644
index 0000000..b389327
--- /dev/null
+++ b/video/out/cocoa-cb/video_layer.swift
@@ -0,0 +1,233 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import Cocoa
+import OpenGL.GL
+import OpenGL.GL3
+
+/// OpenGL layer that mpv renders into through its render API.
+class VideoLayer: CAOpenGLLayer {
+
+    /// Backend that owns this layer. Held weakly, so it can become nil.
+    weak var cocoaCB: CocoaCB!
+    /// mpv helper of the backend, or nil when `cocoaCB` is nil.
+    var mpv: MPVHelper! {
+        get { return cocoaCB == nil ? nil : cocoaCB.mpv }
+    }
+
+    /// Guards `hasVideo` (see `setVideo(_:)` and `update()`).
+    let videoLock = NSLock()
+    /// Serializes `display()`.
+    let displayLock = NSLock()
+    var hasVideo: Bool = false
+    /// Set by `update()` before displaying, cleared when a frame is drawn.
+    var needsFlip: Bool = false
+    /// Becomes true once the layer has been drawn through Core Animation.
+    var canDrawOffScreen: Bool = false
+    var cglContext: CGLContextObj? = nil
+    /// Size handed to the renderer, refreshed by `updateSurfaceSize()`.
+    var surfaceSize: NSSize?
+
+    /// Drawing state; `atomic` and `atomicEnd` are used while screen updates
+    /// are disabled (see `atomicDrawingStart()` / `atomicDrawingEnd()`).
+    enum Draw: Int { case normal = 1, atomic, atomicEnd }
+    var draw: Draw = .normal
+
+    /// Queue on which `update()` triggers displays.
+    let queue: DispatchQueue = DispatchQueue(label: "io.mpv.queue.draw")
+
+    /// Requests a redraw when set; the ICC profile is refreshed after the
+    /// next draw.
+    var needsICCUpdate: Bool = false {
+        didSet {
+            if needsICCUpdate {
+                update()
+            }
+        }
+    }
+
+    /// Switches the layer to asynchronous drawing when a live resize starts;
+    /// `canDraw` switches it back once the resize is over.
+    var inLiveResize: Bool = false {
+        didSet {
+            if inLiveResize {
+                isAsynchronous = true
+            }
+            update()
+        }
+    }
+
+    /// Creates the layer, its CGL context (swap interval 1) and initializes
+    /// mpv rendering with the update and control callbacks.
+    /// Reports an error and exits when the context cannot be created.
+    init(cocoaCB ccb: CocoaCB) {
+        cocoaCB = ccb
+        super.init()
+        autoresizingMask = [.layerWidthSizable, .layerHeightSizable]
+        backgroundColor = NSColor.black.cgColor
+
+        let pixelFormat = copyCGLPixelFormat(forDisplayMask: 0)
+        let err = CGLCreateContext(pixelFormat, nil, &cglContext)
+        // The pixel format was copied only for context creation; release it
+        // so it does not leak.
+        CGLReleasePixelFormat(pixelFormat)
+
+        guard err == kCGLNoError, let ctx = cglContext else {
+            let errS = String(cString: CGLErrorString(err))
+            mpv.sendError("Couldn't create CGL context: \(errS) (\(err.rawValue))")
+            exit(1)
+        }
+
+        var swapInterval: GLint = 1
+        CGLSetParameter(ctx, kCGLCPSwapInterval, &swapInterval)
+        CGLSetCurrentContext(ctx)
+
+        mpv.initRender()
+        mpv.setRenderUpdateCallback(updateCallback, context: self)
+        mpv.setRenderControlCallback(cocoaCB.controlCallback, context: cocoaCB)
+    }
+
+    /// Creates a copy of another `VideoLayer`, carrying over the backend,
+    /// the CGL context and the surface size so the copy can hand out a
+    /// context from `copyCGLContext(forPixelFormat:)`.
+    override init(layer: Any) {
+        guard let oldLayer = layer as? VideoLayer else {
+            fatalError("init(layer:) passed an invalid layer")
+        }
+        cocoaCB = oldLayer.cocoaCB
+        cglContext = oldLayer.cglContext
+        surfaceSize = oldLayer.surfaceSize
+        super.init()
+    }
+
+    required init?(coder: NSCoder) {
+        fatalError("init(coder:) has not been implemented")
+    }
+
+    /// Drawing is allowed once mpv is available and the backend is
+    /// initialized. Also leaves asynchronous mode outside of a live resize.
+    override func canDraw(inCGLContext ctx: CGLContextObj,
+                          pixelFormat pf: CGLPixelFormatObj,
+                          forLayerTime t: CFTimeInterval,
+                          displayTime ts: UnsafePointer<CVTimeStamp>?) -> Bool {
+        if !inLiveResize {
+            isAsynchronous = false
+        }
+        return mpv != nil && cocoaCB.backendState == .initialized
+    }
+
+    override func draw(inCGLContext ctx: CGLContextObj,
+                       pixelFormat pf: CGLPixelFormatObj,
+                       forLayerTime t: CFTimeInterval,
+                       displayTime ts: UnsafePointer<CVTimeStamp>?) {
+        needsFlip = false
+        canDrawOffScreen = true
+        draw(ctx)
+    }
+
+    /// Renders one frame into `ctx` and flushes it.
+    func draw(_ ctx: CGLContextObj) {
+        // While drawing atomically, the first draw advances the state and
+        // the following one re-enables screen updates.
+        if draw.rawValue >= Draw.atomic.rawValue {
+            if draw == .atomic {
+                draw = .atomicEnd
+            } else {
+                atomicDrawingEnd()
+            }
+        }
+
+        updateSurfaceSize()
+        mpv.drawRender(surfaceSize!)
+        CGLFlushDrawable(ctx)
+
+        if needsICCUpdate {
+            needsICCUpdate = false
+            cocoaCB.updateICCProfile()
+        }
+    }
+
+    /// Reads the surface size from the GL viewport, falling back to the
+    /// layer bounds scaled by `contentsScale` when the viewport is empty.
+    func updateSurfaceSize() {
+        var dims: [GLint] = [0, 0, 0, 0]
+        glGetIntegerv(GLenum(GL_VIEWPORT), &dims)
+        surfaceSize = NSMakeSize(CGFloat(dims[2]), CGFloat(dims[3]))
+
+        if NSEqualSizes(surfaceSize!, NSZeroSize) {
+            surfaceSize = bounds.size
+            surfaceSize!.width *= contentsScale
+            surfaceSize!.height *= contentsScale
+        }
+    }
+
+    /// Disables screen updates and enters atomic drawing, if video is shown
+    /// and drawing is currently in the normal state.
+    func atomicDrawingStart() {
+        if draw == .normal && hasVideo {
+            NSDisableScreenUpdates()
+            draw = .atomic
+        }
+    }
+
+    /// Re-enables screen updates and returns to normal drawing.
+    func atomicDrawingEnd() {
+        if draw.rawValue >= Draw.atomic.rawValue {
+            NSEnableScreenUpdates()
+            draw = .normal
+        }
+    }
+
+    /// Chooses a pixel format, preferring the 3.2 core profile over legacy.
+    /// For each profile the optional attributes are dropped one at a time,
+    /// last first, until a format is found. Reports an error and exits when
+    /// none is available.
+    override func copyCGLPixelFormat(forDisplayMask mask: UInt32) -> CGLPixelFormatObj {
+        let glVersions: [CGLOpenGLProfile] = [
+            kCGLOGLPVersion_3_2_Core,
+            kCGLOGLPVersion_Legacy
+        ]
+        // Attributes from this index up to the terminator are optional.
+        let firstOptionalIndex = 4
+
+        var pix: CGLPixelFormatObj?
+        var err: CGLError = CGLError(rawValue: 0)
+        var npix: GLint = 0
+
+        verLoop : for ver in glVersions {
+            var glAttributes: [CGLPixelFormatAttribute] = [
+                kCGLPFAOpenGLProfile, CGLPixelFormatAttribute(ver.rawValue),
+                kCGLPFAAccelerated,
+                kCGLPFADoubleBuffer,
+                kCGLPFABackingStore,
+                kCGLPFAAllowOfflineRenderers,
+                kCGLPFASupportsAutomaticGraphicsSwitching,
+                _CGLPixelFormatAttribute(rawValue: 0)
+            ]
+            var lastOptionalIndex = glAttributes.count - 2
+
+            // Try first and remove afterwards, so the set left after the
+            // final removal is attempted as well.
+            while true {
+                err = CGLChoosePixelFormat(glAttributes, &pix, &npix)
+                let failed = err == kCGLBadAttribute || err == kCGLBadPixelFormat || pix == nil
+                if !failed {
+                    break verLoop
+                }
+                if lastOptionalIndex < firstOptionalIndex {
+                    break
+                }
+                glAttributes.remove(at: lastOptionalIndex)
+                lastOptionalIndex -= 1
+            }
+        }
+
+        if err != kCGLNoError || pix == nil {
+            let errS = String(cString: CGLErrorString(err))
+            mpv.sendError("Couldn't create CGL pixel format: \(errS) (\(err.rawValue))")
+            exit(1)
+        }
+        return pix!
+    }
+
+    /// Returns the layer's own context and syncs `contentsScale` with the
+    /// window's backing scale factor.
+    override func copyCGLContext(forPixelFormat pf: CGLPixelFormatObj) -> CGLContextObj {
+        contentsScale = cocoaCB.window.backingScaleFactor
+        return cglContext!
+    }
+
+    /// Render update callback registered with mpv in `init(cocoaCB:)`; `ctx`
+    /// is bridged back to the layer, which is then asked to update.
+    let updateCallback: mpv_render_update_fn = { (ctx) in
+        let layer: VideoLayer = MPVHelper.bridge(ptr: ctx!)
+        layer.update()
+    }
+
+    /// Displays the layer. If a requested frame is still pending afterwards,
+    /// it is drawn directly when the window is not visible and off-screen
+    /// drawing is possible; otherwise another update is scheduled.
+    override func display() {
+        displayLock.lock()
+        defer { displayLock.unlock() }
+
+        let isUpdate = needsFlip
+        super.display()
+        CATransaction.flush()
+        if isUpdate {
+            if !cocoaCB.window.occlusionState.contains(.visible) &&
+                needsFlip && canDrawOffScreen
+            {
+                CGLSetCurrentContext(cglContext!)
+                draw(cglContext!)
+            } else if needsFlip {
+                update()
+            }
+        }
+    }
+
+    /// Sets whether video is available for drawing.
+    func setVideo(_ state: Bool) {
+        videoLock.lock()
+        defer { videoLock.unlock() }
+        hasVideo = state
+    }
+
+    /// Schedules a display on the draw queue, unless a live resize is in
+    /// progress or no video is available.
+    func update() {
+        queue.async {
+            self.videoLock.lock()
+            defer { self.videoLock.unlock() }
+            if !self.inLiveResize && self.hasVideo {
+                self.needsFlip = true
+                self.display()
+            }
+        }
+    }
+
+}
diff --git a/video/out/cocoa-cb/window.swift b/video/out/cocoa-cb/window.swift
new file mode 100644
index 0000000..907476f
--- /dev/null
+++ b/video/out/cocoa-cb/window.swift
@@ -0,0 +1,591 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import Cocoa
+
+class Window: NSWindow, NSWindowDelegate {
+
+    /// Backend that owns this window. Held weakly, so it can become nil.
+    weak var cocoaCB: CocoaCB! = nil
+    /// mpv helper of the backend, or nil when `cocoaCB` is nil.
+    var mpv: MPVHelper! {
+        get { return cocoaCB == nil ? nil : cocoaCB.mpv }
+    }
+
+    // Screens tracked across fullscreen transitions (see toggleFullScreen).
+    var targetScreen: NSScreen?
+    var previousScreen: NSScreen?
+    var currentScreen: NSScreen?
+    /// Screen the window was on when it last left windowed mode.
+    var unfScreen: NSScreen?
+
+    /// Content frame to use while the window is not in fullscreen.
+    var unfsContentFrame: NSRect?
+    var isInFullscreen: Bool = false
+    /// True while a fullscreen transition is in progress.
+    var isAnimating: Bool = false
+    var isMoving: Bool = false
+    var forceTargetScreen: Bool = false
+
+    /// Locks the content aspect ratio to the windowed content size when
+    /// enabled, and allows 1x1 resize increments when disabled.
+    var keepAspect: Bool = true {
+        didSet {
+            if !isInFullscreen {
+                unfsContentFrame = convertToScreen(contentView!.frame)
+            }
+
+            if keepAspect {
+                contentAspectRatio = unfsContentFrame!.size
+            } else {
+                resizeIncrements = NSSize(width: 1.0, height: 1.0)
+            }
+        }
+    }
+
+    /// Hides the title bar as soon as the border is turned off.
+    var border: Bool = true {
+        didSet { if !border { hideTitleBar() } }
+    }
+
+    /// Effect view placed behind the title bar (created in initTitleBar).
+    var titleBarEffect: NSVisualEffectView?
+    /// Superview of the close button; force-unwrapped, so the window must
+    /// have a close button.
+    var titleBar: NSView {
+        get { return (standardWindowButton(.closeButton)?.superview)! }
+    }
+    /// Height of the frame a titled window adds around an empty content rect.
+    var titleBarHeight: CGFloat {
+        get { return NSWindow.frameRect(forContentRect: CGRect.zero, styleMask: .titled).size.height }
+    }
+    /// The close, miniaturize and zoom buttons that currently exist.
+    var titleButtons: [NSButton] {
+        get { return ([.closeButton, .miniaturizeButton, .zoomButton] as [NSWindowButton]).flatMap { standardWindowButton($0) } }
+    }
+
+    override var canBecomeKey: Bool { return true }
+    override var canBecomeMain: Bool { return true }
+
+    /// Re-applies the first responder and the title after the style mask
+    /// has been changed.
+    override var styleMask: NSWindowStyleMask {
+        get { return super.styleMask }
+        set {
+            let responder = firstResponder
+            let windowTitle = title
+            super.styleMask = newValue
+            makeFirstResponder(responder)
+            title = windowTitle
+        }
+    }
+
+    /// Creates a titled, closable, miniaturizable and resizable window that
+    /// hosts `view`, sets up the title bar and registers the window size
+    /// menu actions.
+    ///
+    /// - Parameters:
+    ///   - contentRect: Initial content rectangle.
+    ///   - screen: Screen to create the window on. When nil, the screen the
+    ///     window ended up on is used for the tracked screens.
+    ///   - view: View added to the content view and sized to fill it.
+    ///   - ccb: Owning backend.
+    convenience init(contentRect: NSRect, screen: NSScreen?, view: NSView, cocoaCB ccb: CocoaCB) {
+        self.init(contentRect: contentRect,
+                  styleMask: [.titled, .closable, .miniaturizable, .resizable],
+                  backing: .buffered, defer: false, screen: screen)
+        cocoaCB = ccb
+        title = cocoaCB.title
+        minSize = NSMakeSize(160, 90)
+        collectionBehavior = .fullScreenPrimary
+        delegate = self
+        contentView!.addSubview(view)
+        view.frame = contentView!.frame
+
+        unfsContentFrame = convertToScreen(contentView!.frame)
+        // `screen` is optional; do not crash on nil but fall back to the
+        // screen the window was actually placed on.
+        let initialScreen = screen ?? self.screen
+        targetScreen = initialScreen
+        currentScreen = initialScreen
+        unfScreen = initialScreen
+        initTitleBar()
+
+        if let app = NSApp as? Application {
+            app.menuBar.register(#selector(setHalfWindowSize), for: MPM_H_SIZE)
+            app.menuBar.register(#selector(setNormalWindowSize), for: MPM_N_SIZE)
+            app.menuBar.register(#selector(setDoubleWindowSize), for: MPM_D_SIZE)
+            app.menuBar.register(#selector(performMiniaturize(_:)), for: MPM_MINIMIZE)
+            app.menuBar.register(#selector(performZoom(_:)), for: MPM_ZOOM)
+        }
+    }
+
+    /// Makes the content view full-size, hides the system title bar and
+    /// installs an initially transparent visual effect view in the title
+    /// bar area at the top of the content view.
+    func initTitleBar() {
+        // Frame of the title bar strip at the top of the content view.
+        var f = contentView!.bounds
+        f.origin.y = f.size.height - titleBarHeight
+        f.size.height = titleBarHeight
+
+        styleMask.insert(.fullSizeContentView)
+        titleBar.alphaValue = 0
+        titlebarAppearsTransparent = true
+        titleBarEffect = NSVisualEffectView(frame: f)
+        titleBarEffect!.alphaValue = 0
+        titleBarEffect!.blendingMode = .withinWindow
+        titleBarEffect!.autoresizingMask = [.viewWidthSizable, .viewMinYMargin]
+
+        // The style must be applied after titleBarEffect exists, since
+        // setTitleBarStyle configures it.
+        setTitleBarStyle(Int(mpv.macOpts!.macos_title_bar_style))
+        contentView!.addSubview(titleBarEffect!, positioned: .above, relativeTo: nil)
+    }
+
+    /// Applies a title bar appearance.
+    ///
+    /// - Parameter style: Either an `Int` (0 dark, 1 ultradark, 2 light,
+    ///   3 mediumlight, 4 auto) or a `String` naming the style. Unknown
+    ///   values, including values of any other type, select "dark".
+    ///   "auto" picks "ultradark" when the "AppleInterfaceStyle" user
+    ///   default is set and "mediumlight" otherwise.
+    func setTitleBarStyle(_ style: Any) {
+        var effect: String
+
+        if let number = style as? Int {
+            switch number {
+            case 4:
+                effect = "auto"
+            case 3:
+                effect = "mediumlight"
+            case 2:
+                effect = "light"
+            case 1:
+                effect = "ultradark"
+            default:
+                effect = "dark"
+            }
+        } else {
+            // Anything that is not a string ends up in the "dark" default
+            // instead of crashing on a forced cast.
+            effect = (style as? String) ?? "dark"
+        }
+
+        if effect == "auto" {
+            let systemStyle = UserDefaults.standard.string(forKey: "AppleInterfaceStyle")
+            effect = systemStyle == nil ? "mediumlight" : "ultradark"
+        }
+
+        // The effect view is configured only if it already exists; the
+        // window appearance is always applied.
+        switch effect {
+        case "mediumlight":
+            appearance = NSAppearance(named: NSAppearanceNameVibrantLight)
+            titleBarEffect?.material = .titlebar
+            titleBarEffect?.state = .followsWindowActiveState
+        case "light":
+            appearance = NSAppearance(named: NSAppearanceNameVibrantLight)
+            titleBarEffect?.material = .light
+            titleBarEffect?.state = .active
+        case "ultradark":
+            appearance = NSAppearance(named: NSAppearanceNameVibrantDark)
+            titleBarEffect?.material = .titlebar
+            titleBarEffect?.state = .followsWindowActiveState
+        default:
+            appearance = NSAppearance(named: NSAppearanceNameVibrantDark)
+            titleBarEffect?.material = .dark
+            titleBarEffect?.state = .active
+        }
+    }
+
+    /// Fades the title bar in. It is hidden again after a delay unless the
+    /// pointer is within the title bar height of the view's origin edge.
+    /// Does nothing without an effect view, or for a borderless window that
+    /// is not in fullscreen.
+    func showTitleBar() {
+        guard let effectView = titleBarEffect else { return }
+        if !border && !isInFullscreen { return }
+        let pointer = cocoaCB.view.convert(mouseLocationOutsideOfEventStream, from: nil)
+
+        for button in titleButtons {
+            button.isHidden = false
+        }
+        NSAnimationContext.runAnimationGroup({ ctx in
+            ctx.duration = 0.20
+            titleBar.animator().alphaValue = 1
+            // The effect view stays hidden in fullscreen and during
+            // fullscreen transitions.
+            if !isInFullscreen && !isAnimating {
+                effectView.animator().alphaValue = 1
+            }
+        }, completionHandler: nil )
+
+        if pointer.y > titleBarHeight {
+            hideTitleBarDelayed()
+        } else {
+            NSObject.cancelPreviousPerformRequests(withTarget: self, selector: #selector(hideTitleBar), object: nil)
+        }
+    }
+
+    /// Fades the title bar and its effect view out and hides the title
+    /// buttons afterwards. In fullscreen, outside of a transition, only the
+    /// effect view is made transparent, without animation.
+    func hideTitleBar() {
+        guard let effectView = titleBarEffect else { return }
+
+        if isInFullscreen && !isAnimating {
+            effectView.alphaValue = 0
+            return
+        }
+
+        NSAnimationContext.runAnimationGroup({ ctx in
+            ctx.duration = 0.20
+            titleBar.animator().alphaValue = 0
+            effectView.animator().alphaValue = 0
+        }, completionHandler: {
+            for button in self.titleButtons {
+                button.isHidden = true
+            }
+        })
+    }
+
+    /// Schedules `hideTitleBar` to run in 0.5 seconds, replacing any hide
+    /// request that is still pending.
+    func hideTitleBarDelayed() {
+        let hideAction = #selector(hideTitleBar)
+        NSObject.cancelPreviousPerformRequests(withTarget: self, selector: hideAction, object: nil)
+        perform(hideAction, with: nil, afterDelay: 0.5)
+    }
+
+    /// Toggles fullscreen, using the native transition when mpv's
+    /// "native-fs" property is set and the custom one otherwise.
+    /// Ignored while a transition is already running.
+    override func toggleFullScreen(_ sender: Any?) {
+        if isAnimating {
+            return
+        }
+
+        isAnimating = true
+
+        // Pick the screen to transition to: the backend's choice if any,
+        // otherwise the remembered previous screen, otherwise the current
+        // one. previousScreen remembers where to return to.
+        targetScreen = cocoaCB.getTargetScreen(forFullscreen: !isInFullscreen)
+        if targetScreen == nil && previousScreen == nil {
+            targetScreen = screen
+        } else if targetScreen == nil {
+            targetScreen = previousScreen
+            previousScreen = nil
+        } else {
+            previousScreen = screen
+        }
+
+        // Remember the windowed geometry before entering fullscreen.
+        if !isInFullscreen {
+            unfsContentFrame = convertToScreen(contentView!.frame)
+            unfScreen = screen
+        }
+        // move window to target screen when going to fullscreen
+        if !isInFullscreen && (targetScreen != screen) {
+            let frame = calculateWindowPosition(for: targetScreen!, withoutBounds: false)
+            setFrame(frame, display: true)
+        }
+
+        if mpv.getBoolProperty("native-fs") {
+            super.toggleFullScreen(sender)
+        } else {
+            if !isInFullscreen {
+                setToFullScreen()
+            }
+            else {
+                setToWindow()
+            }
+        }
+    }
+
+    /// Window delegate: only `window` itself takes part in the custom
+    /// enter-fullscreen animation.
+    func customWindowsToEnterFullScreen(for window: NSWindow) -> [NSWindow]? {
+        return [window]
+    }
+
+    /// Window delegate: only `window` itself takes part in the custom
+    /// exit-fullscreen animation.
+    func customWindowsToExitFullScreen(for window: NSWindow) -> [NSWindow]? {
+        return [window]
+    }
+
+    /// Window delegate: hides the title bar and animates the window frame to
+    /// the target screen's frame. The duration is `duration - 0.05` unless
+    /// overridden (see getFsAnimationDuration).
+    func window(_ window: NSWindow, startCustomAnimationToEnterFullScreenWithDuration duration: TimeInterval) {
+        cocoaCB.view.layerContentsPlacement = .scaleProportionallyToFit
+        hideTitleBar()
+        NSAnimationContext.runAnimationGroup({ (context) -> Void in
+            context.duration = getFsAnimationDuration(duration - 0.05)
+            window.animator().setFrame(targetScreen!.frame, display: true)
+        }, completionHandler: { })
+    }
+
+    /// Window delegate: hides the title bar, jumps to the windowed frame
+    /// aspect-fitted into the current screen and then animates to the
+    /// windowed frame itself. The duration is `duration - 0.05` unless
+    /// overridden (see getFsAnimationDuration).
+    func window(_ window: NSWindow, startCustomAnimationToExitFullScreenWithDuration duration: TimeInterval) {
+        let newFrame = calculateWindowPosition(for: targetScreen!, withoutBounds: targetScreen == screen)
+        // Start from a frame with the final aspect ratio that fits the
+        // current screen.
+        let intermediateFrame = aspectFit(rect: newFrame, in: screen!.frame)
+        cocoaCB.view.layerContentsPlacement = .scaleProportionallyToFill
+        hideTitleBar()
+        setFrame(intermediateFrame, display: true)
+
+        NSAnimationContext.runAnimationGroup({ (context) -> Void in
+            context.duration = getFsAnimationDuration(duration - 0.05)
+            window.animator().setFrame(newFrame, display: true)
+        }, completionHandler: { })
+    }
+
+    /// Window delegate: records the fullscreen state, flags
+    /// VO_EVENT_FULLSCREEN_STATE, updates cursor visibility, finishes the
+    /// transition with the current frame and shows the title bar.
+    func windowDidEnterFullScreen(_ notification: Notification) {
+        isInFullscreen = true
+        cocoaCB.flagEvents(VO_EVENT_FULLSCREEN_STATE)
+        cocoaCB.updateCusorVisibility()
+        endAnimation(frame)
+        showTitleBar()
+    }
+
+    /// Window delegate: records the windowed state, flags
+    /// VO_EVENT_FULLSCREEN_STATE, finishes the transition at the windowed
+    /// frame and restores proportional-fit content placement.
+    func windowDidExitFullScreen(_ notification: Notification) {
+        isInFullscreen = false
+        cocoaCB.flagEvents(VO_EVENT_FULLSCREEN_STATE)
+        endAnimation(calculateWindowPosition(for: targetScreen!, withoutBounds: targetScreen == screen))
+        cocoaCB.view.layerContentsPlacement = .scaleProportionallyToFit
+    }
+
+    /// Window delegate: restores the windowed frame and finishes the
+    /// transition after entering fullscreen failed.
+    func windowDidFailToEnterFullScreen(_ window: NSWindow) {
+        let newFrame = calculateWindowPosition(for: targetScreen!, withoutBounds: targetScreen == screen)
+        setFrame(newFrame, display: true)
+        endAnimation()
+    }
+
+    /// Window delegate: puts the window back on the target screen's full
+    /// frame and finishes the transition after exiting fullscreen failed.
+    func windowDidFailToExitFullScreen(_ window: NSWindow) {
+        let newFrame = targetScreen!.frame
+        setFrame(newFrame, display: true)
+        endAnimation()
+        cocoaCB.view.layerContentsPlacement = .scaleProportionallyToFit
+    }
+
+    /// Finishes a fullscreen transition: optionally animates to `newFrame`
+    /// over 0.01 seconds, clears `isAnimating`, requests a layer update and
+    /// lets the backend check for a pending shutdown.
+    ///
+    /// - Parameter newFrame: Final frame; the zero rect (default) leaves
+    ///   the frame untouched.
+    func endAnimation(_ newFrame: NSRect = NSZeroRect) {
+        let hasFinalFrame = !NSEqualRects(newFrame, NSZeroRect)
+        if hasFinalFrame {
+            NSAnimationContext.runAnimationGroup({ ctx in
+                ctx.duration = 0.01
+                self.animator().setFrame(newFrame, display: true)
+            }, completionHandler: nil )
+        }
+
+        isAnimating = false
+        cocoaCB.layer.update()
+        cocoaCB.checkShutdown()
+    }
+
+    /// Enters fullscreen without the native transition: marks the window
+    /// as fullscreen, auto-hides menu bar and dock, and covers the target
+    /// screen.
+    func setToFullScreen() {
+        styleMask.insert(.fullScreen)
+        NSApp.presentationOptions = [.autoHideMenuBar, .autoHideDock]
+        setFrame(targetScreen!.frame, display: true)
+        // endAnimation clears isAnimating and requests a layer update
+        // before the fullscreen state is recorded below.
+        endAnimation()
+        isInFullscreen = true
+        cocoaCB.flagEvents(VO_EVENT_FULLSCREEN_STATE)
+        cocoaCB.layer.update()
+    }
+
+    /// Leaves fullscreen without the native transition: restores the
+    /// presentation options and the windowed frame and clears the
+    /// fullscreen style.
+    func setToWindow() {
+        let newFrame = calculateWindowPosition(for: targetScreen!, withoutBounds: targetScreen == screen)
+        NSApp.presentationOptions = []
+        setFrame(newFrame, display: true)
+        styleMask.remove(.fullScreen)
+        // endAnimation clears isAnimating and requests a layer update
+        // before the windowed state is recorded below.
+        endAnimation()
+        isInFullscreen = false
+        cocoaCB.flagEvents(VO_EVENT_FULLSCREEN_STATE)
+        cocoaCB.layer.update()
+    }
+
+    /// Returns the fullscreen animation duration in seconds.
+    ///
+    /// Reads mpv's "macos-fs-animation-duration" string property, which is
+    /// interpreted as milliseconds.
+    ///
+    /// - Parameter def: Duration used when the property is missing, is
+    ///   "default", or cannot be parsed as a number.
+    /// - Returns: The configured duration divided by 1000, or `def`.
+    func getFsAnimationDuration(_ def: Double) -> Double {
+        let duration = mpv.getStringProperty("macos-fs-animation-duration") ?? "default"
+        // An unparsable value falls back to the default instead of crashing.
+        guard duration != "default", let milliseconds = Double(duration) else {
+            return def
+        }
+        return milliseconds / 1000
+    }
+
+    /// Sets or clears the always-on-top window level.
+    ///
+    /// - Parameters:
+    ///   - state: When false the window returns to the normal level and
+    ///     `ontopLevel` is ignored.
+    ///   - ontopLevel: Either an `Int` (-1 floating window level, -2 one
+    ///     above the status window level, anything else used as the level
+    ///     itself) or a `String` ("window", "system", or a number). Values
+    ///     that cannot be interpreted select the floating window level
+    ///     instead of crashing.
+    func setOnTop(_ state: Bool, _ ontopLevel: Any) {
+        guard state else {
+            level = Int(CGWindowLevelForKey(.normalWindow))
+            return
+        }
+
+        let floatingLevel = Int(CGWindowLevelForKey(.floatingWindow))
+        let systemLevel = Int(CGWindowLevelForKey(.statusWindow)) + 1
+
+        if let number = ontopLevel as? Int {
+            switch number {
+            case -1:
+                level = floatingLevel
+            case -2:
+                level = systemLevel
+            default:
+                level = number
+            }
+        } else if let name = ontopLevel as? String {
+            switch name {
+            case "window":
+                level = floatingLevel
+            case "system":
+                level = systemLevel
+            default:
+                level = Int(name) ?? floatingLevel
+            }
+        } else {
+            level = floatingLevel
+        }
+
+        collectionBehavior.remove(.transient)
+        collectionBehavior.insert(.managed)
+    }
+
+    /// Allows dragging the window by its background only outside of
+    /// fullscreen and where mpv reports that `pos` can be dragged.
+    func updateMovableBackground(_ pos: NSPoint) {
+        // Short-circuits, so mpv is not queried while in fullscreen.
+        isMovableByWindowBackground = !isInFullscreen && mpv.canBeDraggedAt(pos)
+    }
+
+    /// Stores `rect` as the windowed content frame and resizes the window
+    /// to the frame rect that contains it. Does nothing when `rect` equals
+    /// the current window frame.
+    func updateFrame(_ rect: NSRect) {
+        // NOTE(review): a content rect is compared with the window frame
+        // here; confirm this is intended.
+        guard rect != frame else { return }
+        let windowFrame = frameRect(forContentRect: rect)
+        unfsContentFrame = rect
+        setFrame(windowFrame, display: true)
+    }
+
+    /// Resizes the content to `size`, keeping it centered. In fullscreen
+    /// the new frame is only remembered for the return to windowed mode.
+    func updateSize(_ size: NSSize) {
+        guard size != contentView!.frame.size else { return }
+
+        let centeredFrame = centeredContentSize(for: frame, size: size)
+        if isInFullscreen {
+            unfsContentFrame = centeredFrame
+        } else {
+            updateFrame(centeredFrame)
+        }
+    }
+
+    /// Sets the window frame. While in fullscreen and not in a transition
+    /// the requested frame is replaced by the target screen's frame.
+    /// Re-applies the content aspect ratio when `keepAspect` is set.
+    override func setFrame(_ frameRect: NSRect, display flag: Bool) {
+        var appliedFrame = frameRect
+        if isInFullscreen && !isAnimating {
+            appliedFrame = targetScreen!.frame
+        }
+        super.setFrame(appliedFrame, display: flag)
+
+        if keepAspect {
+            contentAspectRatio = unfsContentFrame!.size
+        }
+    }
+
+    func centeredContentSize(for rect: NSRect, size sz: NSSize) -> NSRect {
+        let content = contentRect(forFrameRect: rect)
+        let insetX = (content.size.width  - sz.width)  / 2  // half the width change
+        let insetY = (content.size.height - sz.height) / 2  // half the height change
+        return NSInsetRect(content, insetX, insetY)          // same center, new size
+    }
+
+    // Scale `r` to fit inside `rTarget` (width first, else height), centered.
+    func aspectFit(rect r: NSRect, in rTarget: NSRect) -> NSRect {
+        let widthScale = rTarget.width / r.width
+        let tooTall = r.height*widthScale > rTarget.height
+        let scale = tooTall ? rTarget.height / r.height : widthScale
+        let fitted = NSSize(width: r.width * scale, height: r.height * scale)
+        let origin = NSPoint(x: rTarget.midX - fitted.width/2, y: rTarget.midY - fitted.height/2)
+        return NSRect(origin: origin, size: fitted)
+    }
+
+    func calculateWindowPosition(for tScreen: NSScreen, withoutBounds: Bool) -> NSRect {
+        var newFrame = frameRect(forContentRect: unfsContentFrame!)  // windowed frame incl. decorations
+        let targetFrame = tScreen.frame
+        let targetVisibleFrame = tScreen.visibleFrame
+        let unfsScreenFrame = unfScreen!.frame
+        let visibleWindow = NSIntersectionRect(unfsScreenFrame, newFrame)  // part of the frame lying on unfScreen
+        // Computes where the windowed frame goes on tScreen from its margins on unfScreen.
+        // calculate visible area of every side
+        let left = newFrame.origin.x - unfsScreenFrame.origin.x  // signed margin to the screen's left edge
+        let right = unfsScreenFrame.size.width -
+            (newFrame.origin.x - unfsScreenFrame.origin.x + newFrame.size.width)
+        let bottom = newFrame.origin.y - unfsScreenFrame.origin.y  // signed margin to the screen's bottom edge
+        let top = unfsScreenFrame.size.height -
+            (newFrame.origin.y - unfsScreenFrame.origin.y + newFrame.size.height)
+
+        // normalize visible areas, decide which one to take horizontal/vertical
+        var xPer = (unfsScreenFrame.size.width - visibleWindow.size.width)
+        var yPer = (unfsScreenFrame.size.height - visibleWindow.size.height)
+        if xPer != 0 { xPer = (left >= 0 || right < 0 ? left : right) / xPer }  // guard avoids dividing by zero
+        if yPer != 0 { yPer = (bottom >= 0 || top < 0 ? bottom : top) / yPer }
+
+        // calculate visible area for every side for target screen
+        let xNewLeft = targetFrame.origin.x +
+            (targetFrame.size.width - visibleWindow.size.width) * xPer
+        let xNewRight = targetFrame.origin.x + targetFrame.size.width -
+            (targetFrame.size.width - visibleWindow.size.width) * xPer - newFrame.size.width
+        let yNewBottom = targetFrame.origin.y +
+            (targetFrame.size.height - visibleWindow.size.height) * yPer
+        let yNewTop = targetFrame.origin.y + targetFrame.size.height -
+            (targetFrame.size.height - visibleWindow.size.height) * yPer - newFrame.size.height
+
+        // calculate new coordinates, decide which one to take horizontal/vertical
+        newFrame.origin.x = left >= 0 || right < 0 ? xNewLeft : xNewRight  // same side choice as for xPer
+        newFrame.origin.y = bottom >= 0 || top < 0 ? yNewBottom : yNewTop
+
+        // don't place new window on top of a visible menubar
+        let topMar = targetFrame.size.height -
+            (newFrame.origin.y - targetFrame.origin.y + newFrame.size.height)
+        let menuBarHeight = targetFrame.size.height -
+            (targetVisibleFrame.size.height + targetVisibleFrame.origin.y)
+        if topMar < menuBarHeight {
+            newFrame.origin.y -= top - menuBarHeight  // NOTE(review): uses `top` (old-screen margin), not `topMar` — confirm intended
+        }
+
+        if withoutBounds {  // caller asked for the position without clamping to the screen
+            return newFrame
+        }
+
+        // screen bounds right and left
+        if newFrame.origin.x + newFrame.size.width > targetFrame.origin.x + targetFrame.size.width {
+            newFrame.origin.x = targetFrame.origin.x + targetFrame.size.width - newFrame.size.width
+        }
+        if newFrame.origin.x < targetFrame.origin.x {  // checked last, so the left edge wins for oversized frames
+            newFrame.origin.x = targetFrame.origin.x
+        }
+
+        // screen bounds top and bottom
+        if newFrame.origin.y + newFrame.size.height > targetFrame.origin.y + targetFrame.size.height {
+            newFrame.origin.y = targetFrame.origin.y + targetFrame.size.height - newFrame.size.height
+        }
+        if newFrame.origin.y < targetFrame.origin.y {  // checked last, so the bottom edge wins for oversized frames
+            newFrame.origin.y = targetFrame.origin.y
+        }
+        return newFrame
+    }
+
+    override func constrainFrameRect(_ frameRect: NSRect, to tScreen: NSScreen?) -> NSRect {
+        if (isAnimating && !isInFullscreen) || (!isAnimating && isInFullscreen) {  // no constraining in these two states
+            return frameRect
+        }
+
+        var nf: NSRect = frameRect  // new frame, adjusted below and returned
+        let ts: NSScreen = tScreen ?? screen ?? NSScreen.main()!  // first available screen
+        let of: NSRect = frame  // current (old) window frame
+        let vf: NSRect = (isAnimating ? targetScreen! : ts).visibleFrame  // visible area to constrain against
+        let ncf: NSRect = contentRect(forFrameRect: nf)  // content rect of the requested frame
+
+        // screen bounds top and bottom
+        if NSMaxY(nf) > NSMaxY(vf) {
+            nf.origin.y = NSMaxY(vf) - NSHeight(nf)
+        }
+        if NSMaxY(ncf) < NSMinY(vf) {  // content entirely below the visible area
+            nf.origin.y = NSMinY(vf) + NSMinY(ncf) - NSMaxY(ncf)
+        }
+
+        // screen bounds right and left
+        if NSMinX(nf) > NSMaxX(vf) {  // frame entirely right of the visible area
+            nf.origin.x = NSMaxX(vf) - NSWidth(nf)
+        }
+        if NSMaxX(nf) < NSMinX(vf) {  // frame entirely left of the visible area
+            nf.origin.x = NSMinX(vf)
+        }
+
+        if NSHeight(nf) < NSHeight(vf) && NSHeight(of) > NSHeight(vf) && !isInFullscreen {
+            // If the window height is smaller than the visible frame, but it was
+            // bigger previously recenter the smaller window vertically. This is
+            // needed to counter the 'snap to top' behaviour.
+            nf.origin.y = (NSHeight(vf) - NSHeight(nf)) / 2  // NOTE(review): does not add vf.origin.y — confirm for non-zero origins
+        }
+        return nf
+    }
+
+    func setNormalWindowSize() { setWindowScale(1.0) }  // window-scale 1.0
+    func setHalfWindowSize()   { setWindowScale(0.5) }  // window-scale 0.5
+    func setDoubleWindowSize() { setWindowScale(2.0) }  // window-scale 2.0
+
+    func setWindowScale(_ scale: Double) {
+        mpv.commandAsync(["osd-auto", "set", "window-scale", "\(scale)"])  // issues an async `set window-scale <scale>` command
+    }
+
+    // Track the screen the window is on. On a change the new screen is stored
+    // in previousScreen (unless animating) and the display link is retargeted.
+    func windowDidChangeScreen(_ notification: Notification) {
+        guard screen != nil else { return }
+        if currentScreen != screen {
+            if !isAnimating {
+                previousScreen = screen
+            }
+            cocoaCB.updateDisplaylink()
+        }
+        currentScreen = screen
+    }
+
+    func windowDidChangeScreenProfile(_ notification: Notification) {
+        cocoaCB.layer.needsICCUpdate = true  // mark the layer's ICC profile as needing an update
+    }
+
+    func windowDidChangeBackingProperties(_ notification: Notification) {
+        cocoaCB.layer.contentsScale = backingScaleFactor  // layer scale follows the window's backing scale factor
+    }
+
+    func windowWillStartLiveResize(_ notification: Notification) {
+        cocoaCB.layer.inLiveResize = true  // tell the layer a live resize is in progress
+    }
+
+    func windowDidEndLiveResize(_ notification: Notification) {
+        cocoaCB.layer.inLiveResize = false  // live resize finished
+    }
+
+    func windowShouldClose(_ sender: Any) -> Bool {
+        cocoa_put_key(SWIFT_KEY_CLOSE_WIN)  // report the close request as a key instead
+        return false                        // always refuse the direct close
+    }
+
+    func windowDidResignKey(_ notification: Notification) {
+        cocoaCB.setCursorVisiblility(true)  // request a visible cursor once the window is no longer key
+    }
+
+    func windowDidBecomeKey(_ notification: Notification) {
+        cocoaCB.updateCusorVisibility()  // re-apply the stored cursorVisibilityWanted value
+    }
+
+    func windowWillMove(_ notification: Notification) {
+        isMoving = true  // set when a move starts; not reset in this method
+    }
+}
diff --git a/video/out/cocoa/window.m b/video/out/cocoa/window.m
index 2feaab9..3762987 100644
--- a/video/out/cocoa/window.m
+++ b/video/out/cocoa/window.m
@@ -45,7 +45,8 @@
 @synthesize targetScreen = _target_screen;
 @synthesize previousScreen = _previous_screen;
 @synthesize currentScreen = _current_screen;
-@synthesize unfScreen = _unf_Screen;
+@synthesize unfScreen = _unf_screen;
+
 - (id)initWithContentRect:(NSRect)content_rect
                 styleMask:(NSWindowStyleMask)style_mask
                   backing:(NSBackingStoreType)buffering_type
diff --git a/video/out/cocoa_cb_common.swift b/video/out/cocoa_cb_common.swift
new file mode 100644
index 0000000..a4aae9f
--- /dev/null
+++ b/video/out/cocoa_cb_common.swift
@@ -0,0 +1,514 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import Cocoa
+import IOKit.pwr_mgt
+
+class CocoaCB: NSObject {
+
+    var mpv: MPVHelper!
+    var window: Window!
+    var view: EventsView!
+    var layer: VideoLayer!
+    var link: CVDisplayLink?
+
+    var cursorHidden: Bool = false
+    var cursorVisibilityWanted: Bool = true
+    var isShuttingDown: Bool = false
+
+    var title: String = "mpv" {
+        didSet { if window != nil { window.title = title } }
+    }
+
+    enum State {
+        case uninitialized
+        case needsInit
+        case initialized
+    }
+    var backendState: State = .uninitialized
+
+    let eventsLock = NSLock()
+    var events: Int = 0
+
+    var lightSensor: io_connect_t = 0
+    var lastLmu: UInt64 = 0
+    var lightSensorIOPort: IONotificationPortRef?
+    var displaySleepAssertion: IOPMAssertionID = IOPMAssertionID(0)
+
+    let queue: DispatchQueue = DispatchQueue(label: "io.mpv.queue")
+
+    convenience init(_ mpvHandle: OpaquePointer) {
+        self.init()
+        mpv = MPVHelper(mpvHandle)         // helper built from the given mpv handle
+        layer = VideoLayer(cocoaCB: self)  // video layer created with this object as its cocoaCB
+    }
+
+    func preinit(_ vo: UnsafeMutablePointer<vo>) {
+        if backendState == .uninitialized {  // the setup below runs only for the first call
+            backendState = .needsInit
+
+            if let app = NSApp as? Application {
+                let ptr = mp_get_config_group(mpv.mpctx!, vo.pointee.global,
+                                              app.getMacOSConf())
+                mpv.macOpts = UnsafeMutablePointer<macos_opts>(OpaquePointer(ptr))!.pointee  // stores a copy of the option struct
+            }
+
+            view = EventsView(cocoaCB: self)
+            view.layer = layer  // the view is backed by the video layer
+            view.wantsLayer = true
+            view.layerContentsPlacement = .scaleProportionallyToFit
+            startDisplayLink(vo)
+            initLightSensor()
+            addDisplayReconfigureObserver()  // the window itself is created later, in initBackend
+        }
+    }
+
+    func uninit() {  // stop video output and hide the window, if one was created
+        layer.setVideo(false)
+        window?.orderOut(nil)  // window is only set in initBackend; may still be nil here
+    }
+
+    func reconfig(_ vo: UnsafeMutablePointer<vo>) {
+        guard backendState != .needsInit else {
+            DispatchQueue.main.sync { self.initBackend(vo) }  // first configuration creates the window
+            return
+        }
+        DispatchQueue.main.async {  // otherwise re-enable video and resize on the main queue
+            self.layer.setVideo(true)
+            self.updateWindowSize(vo)
+            self.layer.update()
+        }
+    }
+
+    func initBackend(_ vo: UnsafeMutablePointer<vo>) {
+        let opts: mp_vo_opts = vo.pointee.opts.pointee  // copy of the VO options
+        NSApp.setActivationPolicy(.regular)
+        setAppIcon()
+
+        let targetScreen = getScreenBy(id: Int(opts.screen_id)) ?? NSScreen.main()  // fall back to the main screen
+        let wr = getWindowGeometry(forScreen: targetScreen!, videoOut: vo)
+        window = Window(contentRect: wr, screen: targetScreen, view: view, cocoaCB: self)
+        updateICCProfile()  // needs the window, so it must come after its creation
+        window.setOnTop(Bool(opts.ontop), Int(opts.ontop_level))
+        window.keepAspect = Bool(opts.keepaspect_window)
+        window.title = title
+        window.border = Bool(opts.border)
+
+        window.isRestorable = false
+        window.makeMain()
+        window.makeKeyAndOrderFront(nil)
+        NSApp.activate(ignoringOtherApps: true)
+        layer.setVideo(true)
+
+        if Bool(opts.fullscreen) {
+            DispatchQueue.main.async {  // fullscreen toggle is deferred to a later main-queue turn
+                self.window.toggleFullScreen(nil)
+            }
+        } else {
+            window.isMovableByWindowBackground = true
+        }
+
+        backendState = .initialized  // from now on reconfig takes the update path
+    }
+
+    func updateWindowSize(_ vo: UnsafeMutablePointer<vo>) {
+        let opts: mp_vo_opts = vo.pointee.opts.pointee
+        let targetScreen = getScreenBy(id: Int(opts.screen_id)) ?? NSScreen.main()  // fall back to the main screen
+        let wr = getWindowGeometry(forScreen: targetScreen!, videoOut: vo)
+        if !window.isVisible {  // bring a hidden window back before resizing
+            window.makeKeyAndOrderFront(nil)
+        }
+        layer.atomicDrawingStart()
+        window.updateSize(wr.size)  // only the size of the computed geometry is used
+    }
+
+    func setAppIcon() {
+        // the icon is only available when NSApp is an `Application` instance
+        guard let app = NSApp as? Application else { return }
+        NSApp.applicationIconImage = app.getMPVIcon()
+    }
+
+    let linkCallback: CVDisplayLinkOutputCallback = {  // used by startDisplayLink when macOS 10.12 is unavailable
+                    (displayLink: CVDisplayLink,
+                           inNow: UnsafePointer<CVTimeStamp>,
+                    inOutputTime: UnsafePointer<CVTimeStamp>,
+                         flagsIn: CVOptionFlags,
+                        flagsOut: UnsafeMutablePointer<CVOptionFlags>,
+              displayLinkContext: UnsafeMutableRawPointer?) -> CVReturn in
+        let ccb: CocoaCB = MPVHelper.bridge(ptr: displayLinkContext!)  // context is the CocoaCB passed at registration
+        ccb.mpv.reportRenderFlip()
+        return kCVReturnSuccess
+    }
+
+    func startDisplayLink(_ vo: UnsafeMutablePointer<vo>) {
+        let opts: mp_vo_opts = vo.pointee.opts.pointee
+        let screen = getScreenBy(id: Int(opts.screen_id)) ?? NSScreen.main()  // fall back to the main screen
+        let displayId = screen!.deviceDescription["NSScreenNumber"] as! UInt32
+
+        CVDisplayLinkCreateWithActiveCGDisplays(&link)  // return value is not checked; `link!` below assumes success
+        CVDisplayLinkSetCurrentCGDisplay(link!, displayId)
+        if #available(macOS 10.12, *) {
+            CVDisplayLinkSetOutputHandler(link!) { link, now, out, inFlags, outFlags -> CVReturn in
+                self.mpv.reportRenderFlip()  // same work as linkCallback
+                return kCVReturnSuccess
+            }
+        } else {
+            CVDisplayLinkSetOutputCallback(link!, linkCallback, MPVHelper.bridge(obj: self))
+        }
+        CVDisplayLinkStart(link!)
+    }
+
+    func stopDisplaylink() {
+        // nothing to stop without a link or when it is not running
+        guard let activeLink = link, CVDisplayLinkIsRunning(activeLink) else { return }
+        CVDisplayLinkStop(activeLink)
+    }
+
+    func updateDisplaylink() {
+        let displayId = UInt32(window.screen!.deviceDescription["NSScreenNumber"] as! Int)  // NOTE(review): startDisplayLink casts the same value `as! UInt32`
+        CVDisplayLinkSetCurrentCGDisplay(link!, displayId)  // retarget the link to the window's current display
+
+        queue.asyncAfter(deadline: DispatchTime.now() + 0.1) {  // flag the event 0.1s later
+            self.flagEvents(VO_EVENT_WIN_STATE)
+        }
+    }
+
+    func currentFps() -> Double {
+        var actualFps = CVDisplayLinkGetActualOutputVideoRefreshPeriod(link!)  // a period here; inverted to a rate below
+        let nominalData = CVDisplayLinkGetNominalOutputVideoRefreshPeriod(link!)
+
+        if (nominalData.flags & Int32(CVTimeFlags.isIndefinite.rawValue)) < 1 {  // nominal period is not indefinite
+            let nominalFps = Double(nominalData.timeScale) / Double(nominalData.timeValue)
+
+            if actualFps > 0 {  // avoid dividing by a zero period
+                actualFps = 1/actualFps
+            }
+
+            if fabs(actualFps - nominalFps) > 0.1 {  // measured rate deviates too much: use the nominal one
+                mpv.sendVerbose("Falling back to nominal display refresh rate: \(nominalFps)")
+                return nominalFps
+            } else {
+                return actualFps
+            }
+        }
+        mpv.sendWarning("Falling back to standard display refresh rate: 60Hz")
+        return 60.0
+    }
+
+    func enableDisplaySleep() {
+        IOPMAssertionRelease(displaySleepAssertion)  // called even when no assertion is held (id 0); result ignored
+        displaySleepAssertion = IOPMAssertionID(0)   // 0 marks "no assertion held" for disableDisplaySleep
+    }
+
+    func disableDisplaySleep() {
+        if displaySleepAssertion != IOPMAssertionID(0) { return }  // an assertion is already held
+        IOPMAssertionCreateWithName(
+            kIOPMAssertionTypePreventUserIdleDisplaySleep as CFString,
+            IOPMAssertionLevel(kIOPMAssertionLevelOn),
+            "io.mpv.video_playing_back" as CFString,
+            &displaySleepAssertion)  // id is stored for the later release in enableDisplaySleep
+    }
+
+    func updateCusorVisibility() {
+        setCursorVisiblility(cursorVisibilityWanted)  // re-apply the stored wanted visibility
+    }
+
+    // Show or hide the cursor; a hide request is ignored where the view says it cannot hide.
+    func setCursorVisiblility(_ visible: Bool) {
+        let shouldShow = visible || !view.canHideCursor()
+        guard shouldShow == cursorHidden else { return }  // already in the wanted state
+        if shouldShow {
+            NSCursor.unhide()
+        } else {
+            NSCursor.hide()
+        }
+        cursorHidden = !shouldShow
+    }
+
+    func updateICCProfile() {
+        mpv.setRenderICCProfile(window.screen!.colorSpace!)  // force-unwraps: requires a window on a screen with a color space
+        layer.colorspace = window.screen!.colorSpace!.cgColorSpace!  // the layer gets the same screen color space
+    }
+
+    func lmuToLux(_ v: UInt64) -> Int {
+        // The polynomial approximation for Apple LMU value -> lux was derived
+        // empirically by Firefox developers (Apple provides no documentation).
+        // https://bugzilla.mozilla.org/show_bug.cgi?id=793728
+        let scale4 = 1 / pow(10, 27)
+        let scale3 = 1 / pow(10, 19)
+        let scale2 = 1 / pow(10, 12)
+        let scale1 = 1 / pow(10, 5)
+        let reading = Decimal(v)
+        let quartic = -3.0 * scale4 * pow(reading, 4)
+        let cubic = 2.6 * scale3 * pow(reading, 3)
+        let quadratic = -3.4 * scale2 * pow(reading, 2)
+        let linear = 3.9 * scale1 * reading
+
+        let lux = Int(ceil( Double((quartic + cubic + quadratic + linear - 0.19) as NSNumber)))
+        return max(lux, 0)  // negative results are clamped to zero
+    }
+
+    var lightSensorCallback: IOServiceInterestCallback = { (ctx, service, messageType, messageArgument) -> Void in
+        let ccb: CocoaCB = MPVHelper.bridge(ptr: ctx!)
+
+        // read the two sensor values and forward their mean when it changed
+        var readings: [UInt64] = [0, 0]
+        var readingCount: UInt32 = 2
+        let status = IOConnectCallMethod(ccb.lightSensor, 0, nil, 0, nil, 0, &readings, &readingCount, nil, nil)
+        guard status == KERN_SUCCESS else { return }
+
+        let mean = (readings[0] + readings[1]) / 2
+        if ccb.lastLmu != mean {
+            ccb.lastLmu = mean
+            ccb.mpv.setRenderLux(ccb.lmuToLux(ccb.lastLmu))
+        }
+    }
+
+    func initLightSensor() {
+        let srv = IOServiceGetMatchingService(kIOMasterPortDefault, IOServiceMatching("AppleLMUController"))
+        if srv == IO_OBJECT_NULL {  // no matching service: leave the sensor unset
+            mpv.sendVerbose("Can't find an ambient light sensor")
+            return
+        }
+
+        lightSensorIOPort = IONotificationPortCreate(kIOMasterPortDefault)
+        IONotificationPortSetDispatchQueue(lightSensorIOPort, queue)  // notifications are delivered on `queue`
+        var n = io_object_t()
+        IOServiceAddInterestNotification(lightSensorIOPort, srv, kIOGeneralInterest, lightSensorCallback, MPVHelper.bridge(obj: self), &n)
+        let kr = IOServiceOpen(srv, mach_task_self_, 0, &lightSensor)
+        IOObjectRelease(srv)  // the service object is released right after the open attempt
+
+        if kr != KERN_SUCCESS {
+            mpv.sendVerbose("Can't start ambient light sensor connection")
+            return
+        }
+        lightSensorCallback(MPVHelper.bridge(obj: self), 0, 0, nil)  // invoke once by hand for an initial reading
+    }
+
+    func uninitLightSensor() {
+        if lightSensorIOPort != nil {  // only set once initLightSensor found a sensor service
+            IONotificationPortDestroy(lightSensorIOPort)
+            IOObjectRelease(lightSensor)  // NOTE(review): opened with IOServiceOpen — confirm whether IOServiceClose is needed
+        }
+    }
+
+    var reconfigureCallback: CGDisplayReconfigurationCallBack = { (display, flags, userInfo) in
+        if flags.contains(.setModeFlag) {  // only display mode changes are handled
+            let ccb: CocoaCB = MPVHelper.bridge(ptr: userInfo!)
+            let number = ccb.window?.screen?.deviceDescription["NSScreenNumber"] as? NSNumber
+            if number?.uint32Value == display {  // nil (no window or screen yet) never matches, instead of crashing
+                ccb.mpv.sendVerbose("Detected display mode change, updating screen refresh rate\n");
+                ccb.flagEvents(VO_EVENT_WIN_STATE)
+            }
+        }
+    }
+
+    func addDisplayReconfigureObserver() {
+        CGDisplayRegisterReconfigurationCallback(reconfigureCallback, MPVHelper.bridge(obj: self))  // self arrives as userInfo
+    }
+
+    func removeDisplayReconfigureObserver() {
+        CGDisplayRemoveReconfigurationCallback(reconfigureCallback, MPVHelper.bridge(obj: self))  // same arguments as the registration
+    }
+
+    func getTargetScreen(forFullscreen fs: Bool) -> NSScreen? {
+        let screenType = fs ? "fs-screen" : "screen"  // property to read
+        let screenID = mpv.getStringProperty(screenType) ?? "current"
+        // symbolic and unparsable values map to id -1, for which getScreenBy returns nil
+        switch screenID {
+        case "current", "default", "all":
+            return getScreenBy(id: -1)
+        default:
+            return getScreenBy(id: Int(screenID) ?? -1)  // was a force-unwrap crash on non-numeric values
+        }
+    }
+
+    // Screen at index `screenID`, or nil for negative or out-of-range ids
+    // (an out-of-range id is also reported through mpv.sendInfo).
+    func getScreenBy(id screenID: Int) -> NSScreen? {
+        let available = NSScreen.screens()!
+        guard screenID < available.count else {
+            mpv.sendInfo("Screen ID \(screenID) does not exist, falling back to current device")
+            return nil
+        }
+        return screenID < 0 ? nil : available[screenID]
+    }
+
+    func getWindowGeometry(forScreen targetScreen: NSScreen,
+                           videoOut vo: UnsafeMutablePointer<vo>) -> NSRect {
+        let r = targetScreen.convertRectToBacking(targetScreen.frame)  // screen frame in backing units
+        var screenRC: mp_rect = mp_rect(x0: Int32(0),
+                                        y0: Int32(0),
+                                        x1: Int32(r.size.width),
+                                        y1: Int32(r.size.height))
+
+        var geo: vo_win_geometry = vo_win_geometry()
+        vo_calc_window_geometry2(vo, &screenRC, Double(targetScreen.backingScaleFactor), &geo)  // fills `geo`
+
+        // flip y coordinates
+        geo.win.y1 = Int32(r.size.height) - geo.win.y1
+        geo.win.y0 = Int32(r.size.height) - geo.win.y0
+
+        let wr = NSMakeRect(CGFloat(geo.win.x0), CGFloat(geo.win.y1),
+                            CGFloat(geo.win.x1 - geo.win.x0),
+                            CGFloat(geo.win.y0 - geo.win.y1))  // after the flip y1 is the lower edge
+        return targetScreen.convertRectFromBacking(wr)  // back from backing units
+    }
+
+    func flagEvents(_ ev: Int) {
+        eventsLock.lock()
+        defer { eventsLock.unlock() }
+        events |= ev  // accumulate until checkEvents() collects them
+    }
+
+    func checkEvents() -> Int {
+        eventsLock.lock()
+        defer { eventsLock.unlock() }
+        let pending = events
+        events = 0  // reading the flags also clears them
+        return pending
+    }
+
+    var controlCallback: mp_render_cb_control_fn = { ( vo, ctx, events, request, data ) -> Int32 in
+        let ccb: CocoaCB = MPVHelper.bridge(ptr: ctx!)  // ctx carries the CocoaCB instance
+
+        switch mp_voctrl(request) {  // handled requests return VO_TRUE, all others VO_NOTIMPL
+        case VOCTRL_CHECK_EVENTS:
+            events!.pointee = Int32(ccb.checkEvents())
+            return VO_TRUE
+        case VOCTRL_FULLSCREEN:
+            DispatchQueue.main.async {
+                ccb.window.toggleFullScreen(nil)
+            }
+            return VO_TRUE
+        case VOCTRL_GET_FULLSCREEN:
+            let fsData = data!.assumingMemoryBound(to: Int32.self)
+            fsData.pointee = ccb.window.isInFullscreen ? 1 : 0
+            return VO_TRUE
+        case VOCTRL_GET_DISPLAY_FPS:
+            let fps = data!.assumingMemoryBound(to: CDouble.self)
+            fps.pointee = ccb.currentFps()
+            return VO_TRUE
+        case VOCTRL_RESTORE_SCREENSAVER:
+            ccb.enableDisplaySleep()
+            return VO_TRUE
+        case VOCTRL_KILL_SCREENSAVER:
+            ccb.disableDisplaySleep()
+            return VO_TRUE
+        case VOCTRL_SET_CURSOR_VISIBILITY:
+            ccb.cursorVisibilityWanted = data!.assumingMemoryBound(to: CBool.self).pointee  // read before going async
+            DispatchQueue.main.async {
+                ccb.setCursorVisiblility(ccb.cursorVisibilityWanted)
+            }
+            return VO_TRUE
+        case VOCTRL_SET_UNFS_WINDOW_SIZE:
+            let sizeData = data!.assumingMemoryBound(to: Int32.self)
+            let size = UnsafeBufferPointer(start: sizeData, count: 2)
+            var rect = NSMakeRect(0, 0, CGFloat(size[0]), CGFloat(size[1]))  // values copied before going async
+            DispatchQueue.main.async {
+                if !ccb.mpv.getBoolProperty("hidpi-window-scale") {
+                    rect = ccb.window.currentScreen!.convertRectFromBacking(rect)
+                }
+                ccb.window.updateSize(rect.size)
+            }
+            return VO_TRUE
+        case VOCTRL_GET_WIN_STATE:
+            let minimized = data!.assumingMemoryBound(to: Int32.self)
+            minimized.pointee = ccb.window.isMiniaturized ? VO_WIN_STATE_MINIMIZED : Int32(0)
+            return VO_TRUE
+        case VOCTRL_UPDATE_WINDOW_TITLE:
+            let titleData = data!.assumingMemoryBound(to: Int8.self)
+            let title = String(cString: titleData)  // copy now, while the caller's buffer is valid
+            DispatchQueue.main.async {
+                ccb.title = title  // use the copy; the raw pointer may be gone by the time this runs
+            }
+            return VO_TRUE
+        case VOCTRL_PREINIT:
+            DispatchQueue.main.sync { ccb.preinit(vo!) }
+            return VO_TRUE
+        case VOCTRL_UNINIT:
+            DispatchQueue.main.async { ccb.uninit() }
+            return VO_TRUE
+        case VOCTRL_RECONFIG:
+            ccb.reconfig(vo!)
+            return VO_TRUE
+        default:
+            return VO_NOTIMPL
+        }
+    }
+
+    func shutdown(_ destroy: Bool = false) {
+        setCursorVisiblility(true)  // request a visible cursor before tearing down
+        layer.setVideo(false)
+        stopDisplaylink()
+        uninitLightSensor()
+        removeDisplayReconfigureObserver()
+        mpv.deinitRender()
+        mpv.deinitMPV(destroy)  // `destroy` is forwarded unchanged
+    }
+
+    func checkShutdown() {
+        // finishes a shutdown that processEvent deferred during an animation
+        guard isShuttingDown else { return }
+        shutdown(true)
+    }
+
+    func processEvent(_ event: UnsafePointer<mpv_event>) {
+        switch event.pointee.event_id {
+        case MPV_EVENT_SHUTDOWN:
+            if window != nil && window.isAnimating {  // defer: checkShutdown() performs it later
+                isShuttingDown = true
+                return
+            }
+            shutdown()
+        case MPV_EVENT_PROPERTY_CHANGE:
+            if backendState == .initialized {  // property changes are ignored until initBackend has run
+                handlePropertyChange(event)
+            }
+        default:
+            break
+        }
+    }
+
+    func handlePropertyChange(_ event: UnsafePointer<mpv_event>) {
+        let pData = OpaquePointer(event.pointee.data)
+        guard let property = UnsafePointer<mpv_event_property>(pData)?.pointee else {  // no payload: nothing to do
+            return
+        }
+
+        switch String(cString: property.name) {  // each case is skipped when its value cannot be converted
+        case "border":
+            if let data = MPVHelper.mpvFlagToBool(property.data) {
+                window.border = data
+            }
+        case "ontop":
+            if let data = MPVHelper.mpvFlagToBool(property.data) {
+                window.setOnTop(data, mpv.getStringProperty("ontop-level") ?? "window")  // level passed in its string form
+            }
+        case "keepaspect-window":
+            if let data = MPVHelper.mpvFlagToBool(property.data) {
+                window.keepAspect = data
+            }
+        case "macos-title-bar-style":
+            if let data = MPVHelper.mpvStringArrayToString(property.data) {
+                window.setTitleBarStyle(data)
+            }
+        default:
+            break
+        }
+    }
+}
diff --git a/video/out/d3d11/context.c b/video/out/d3d11/context.c
index b02d2e8..82c7d16 100644
--- a/video/out/d3d11/context.c
+++ b/video/out/d3d11/context.c
@@ -70,14 +70,6 @@ struct priv {
     IDXGISwapChain *swapchain;
 };
 
-static struct mp_image *d3d11_screenshot(struct ra_swapchain *sw)
-{
-    struct priv *p = sw->ctx->priv;
-    if (!p->swapchain)
-        return NULL;
-    return mp_d3d11_screenshot(p->swapchain);
-}
-
 static struct ra_tex *get_backbuffer(struct ra_ctx *ctx)
 {
     struct priv *p = ctx->priv;
@@ -131,6 +123,10 @@ static int d3d11_color_depth(struct ra_swapchain *sw)
 static bool d3d11_start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo)
 {
     struct priv *p = sw->priv;
+
+    if (!p->backbuffer)
+        return false;
+
     *out_fbo = (struct ra_fbo) {
         .tex = p->backbuffer,
         .flip = false,
@@ -177,7 +173,6 @@ static void d3d11_uninit(struct ra_ctx *ctx)
 
 static const struct ra_swapchain_fns d3d11_swapchain = {
     .color_depth  = d3d11_color_depth,
-    .screenshot   = d3d11_screenshot,
     .start_frame  = d3d11_start_frame,
     .submit_frame = d3d11_submit_frame,
     .swap_buffers = d3d11_swap_buffers,
@@ -226,6 +221,8 @@ static bool d3d11_init(struct ra_ctx *ctx)
         goto error;
 
     p->backbuffer = get_backbuffer(ctx);
+    if (!p->backbuffer)
+        goto error;
 
     return true;
 
diff --git a/video/out/d3d11/hwdec_d3d11va.c b/video/out/d3d11/hwdec_d3d11va.c
index d83fdc5..8d22fe3 100644
--- a/video/out/d3d11/hwdec_d3d11va.c
+++ b/video/out/d3d11/hwdec_d3d11va.c
@@ -104,9 +104,12 @@ static int init(struct ra_hwdec *hw)
     ID3D10Multithread_SetMultithreadProtected(multithread, TRUE);
     ID3D10Multithread_Release(multithread);
 
+    static const int subfmts[] = {IMGFMT_NV12, IMGFMT_P010, 0};
     p->hwctx = (struct mp_hwdec_ctx){
         .driver_name = hw->driver->name,
         .av_device_ref = d3d11_wrap_device_ref(p->device),
+        .supported_formats = subfmts,
+        .hw_imgfmt = IMGFMT_D3D11,
     };
     hwdec_devices_add(hw->devs, &p->hwctx);
     return 0;
@@ -236,7 +239,7 @@ static void mapper_unmap(struct ra_hwdec_mapper *mapper)
 const struct ra_hwdec_driver ra_hwdec_d3d11va = {
     .name = "d3d11va",
     .priv_size = sizeof(struct priv_owner),
-    .imgfmts = {IMGFMT_D3D11VA, IMGFMT_D3D11NV12, 0},
+    .imgfmts = {IMGFMT_D3D11, 0},
     .init = init,
     .uninit = uninit,
     .mapper = &(const struct ra_hwdec_mapper_driver){
diff --git a/video/out/d3d11/hwdec_dxva2dxgi.c b/video/out/d3d11/hwdec_dxva2dxgi.c
new file mode 100644
index 0000000..97471d0
--- /dev/null
+++ b/video/out/d3d11/hwdec_dxva2dxgi.c
@@ -0,0 +1,465 @@
+/*
+ * This file is part of mpv.
+ *
+ * mpv is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * mpv is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <windows.h>
+#include <d3d9.h>
+#include <d3d11.h>
+#include <dxva2api.h>
+
+#include "common/common.h"
+#include "osdep/windows_utils.h"
+#include "video/hwdec.h"
+#include "video/d3d.h"
+#include "video/out/d3d11/ra_d3d11.h"
+#include "video/out/gpu/hwdec.h"
+
+// Per-driver state (hw->priv). Mappers reach it through mapper->owner->priv.
+struct priv_owner {
+    struct mp_hwdec_ctx hwctx; // Registered with hw->devs at the end of init()
+    ID3D11Device *dev11; // From ra_d3d11_get_device(); released in uninit()
+    IDirect3DDevice9Ex *dev9; // Created in init(); released in uninit()
+};
+
+// One entry of the surface queue shared between D3D9 and D3D11. All members
+// are created in surf_create() and released in surf_destroy().
+struct queue_surf {
+    ID3D11Texture2D *tex11; // Shareable D3D11 texture (RESOURCE_MISC_SHARED)
+    ID3D11Query *idle11; // Event query, ended in surf_release()
+    ID3D11Texture2D *stage11; // Small CPU-readable staging texture
+    IDirect3DTexture9 *tex9; // D3D9 texture created from tex11's share handle
+    IDirect3DSurface9 *surf9; // Level 0 of tex9, destination of StretchRect()
+    IDirect3DSurface9 *stage9; // Small lockable D3D9 render target
+    struct ra_tex *tex; // ra wrapper around tex11, handed out by mapper_map()
+
+    bool busy11; // The surface is currently being used by D3D11
+};
+
+// Per-mapper state (mapper->priv).
+struct priv {
+    ID3D11Device *dev11; // Copied from the owner in mapper_init()
+    ID3D11DeviceContext *ctx11; // Immediate context of dev11
+    IDirect3DDevice9Ex *dev9; // Copied from the owner in mapper_init()
+
+    // Surface queue stuff. Following Microsoft recommendations, a queue of
+    // surfaces is used to share images between D3D9 and D3D11. This allows
+    // multiple D3D11 frames to be in-flight at once.
+    struct queue_surf **queue;
+    int queue_len; // Number of surfaces in queue
+    int queue_pos; // Index of the surface used by the next/current map
+};
+
+// Driver teardown: unregister the hwdec device context, drop its libavutil
+// device reference and release both Direct3D devices held by the owner.
+static void uninit(struct ra_hwdec *hw)
+{
+    struct priv_owner *owner = hw->priv;
+    struct mp_hwdec_ctx *hwctx = &owner->hwctx;
+
+    hwdec_devices_remove(hw->devs, hwctx);
+    av_buffer_unref(&hwctx->av_device_ref);
+
+    SAFE_RELEASE(owner->dev11);
+    SAFE_RELEASE(owner->dev9);
+}
+
+// Driver init for the DXVA2 -> D3D11 interop. Requires a D3D11 ra, creates a
+// windowed Direct3D 9Ex device on the desktop window (16x16 back buffer),
+// checks that the adapter can convert NV12 to XRGB, and registers the D3D9
+// device for hardware decoding. Returns 0 on success, -1 on failure.
+static int init(struct ra_hwdec *hw)
+{
+    struct priv_owner *owner = hw->priv;
+    IDirect3D9Ex *d3d9 = NULL;
+    int status = -1;
+    HRESULT res;
+
+    if (!ra_is_d3d11(hw->ra))
+        goto out;
+    owner->dev11 = ra_d3d11_get_device(hw->ra);
+    if (!owner->dev11)
+        goto out;
+
+    d3d_load_dlls();
+    if (!d3d9_dll) {
+        MP_FATAL(hw, "Failed to load \"d3d9.dll\": %s\n", mp_LastError_to_str());
+        goto out;
+    }
+    if (!dxva2_dll) {
+        MP_FATAL(hw, "Failed to load \"dxva2.dll\": %s\n", mp_LastError_to_str());
+        goto out;
+    }
+
+    // Direct3DCreate9Ex is looked up at runtime in the loaded d3d9.dll
+    HRESULT (WINAPI *create9ex)(UINT SDKVersion, IDirect3D9Ex **ppD3D) =
+        (void *)GetProcAddress(d3d9_dll, "Direct3DCreate9Ex");
+    if (!create9ex) {
+        MP_FATAL(hw, "Direct3D 9Ex not supported\n");
+        goto out;
+    }
+
+    res = create9ex(D3D_SDK_VERSION, &d3d9);
+    if (FAILED(res)) {
+        MP_FATAL(hw, "Couldn't create Direct3D9Ex: %s\n", mp_HRESULT_to_str(res));
+        goto out;
+    }
+
+    D3DPRESENT_PARAMETERS present = {
+        .BackBufferWidth = 16,
+        .BackBufferHeight = 16,
+        .BackBufferCount = 1,
+        .SwapEffect = D3DSWAPEFFECT_DISCARD,
+        .hDeviceWindow = GetDesktopWindow(),
+        .Windowed = TRUE,
+        .Flags = D3DPRESENTFLAG_VIDEO,
+    };
+    const DWORD behavior =
+        D3DCREATE_NOWINDOWCHANGES | D3DCREATE_FPU_PRESERVE |
+        D3DCREATE_HARDWARE_VERTEXPROCESSING |
+        D3DCREATE_DISABLE_PSGP_THREADING | D3DCREATE_MULTITHREADED;
+    res = IDirect3D9Ex_CreateDeviceEx(d3d9, D3DADAPTER_DEFAULT, D3DDEVTYPE_HAL,
+                                      GetDesktopWindow(), behavior, &present,
+                                      NULL, &owner->dev9);
+    if (FAILED(res)) {
+        MP_FATAL(hw, "Failed to create Direct3D9Ex device: %s\n",
+                 mp_HRESULT_to_str(res));
+        goto out;
+    }
+
+    // mapper_map() relies on StretchRect() converting NV12 to XRGB, so make
+    // sure the adapter supports that conversion before advertising the device
+    res = IDirect3D9Ex_CheckDeviceFormatConversion(d3d9, D3DADAPTER_DEFAULT,
+        D3DDEVTYPE_HAL, MAKEFOURCC('N', 'V', '1', '2'), D3DFMT_X8R8G8B8);
+    if (res != S_OK) {
+        MP_FATAL(hw, "Can't StretchRect from NV12 to XRGB surfaces\n");
+        goto out;
+    }
+
+    owner->hwctx = (struct mp_hwdec_ctx){
+        .driver_name = hw->driver->name,
+        .av_device_ref = d3d9_wrap_device_ref((IDirect3DDevice9 *)owner->dev9),
+    };
+    hwdec_devices_add(hw->devs, &owner->hwctx);
+
+    status = 0;
+out:
+    // The factory object is only needed during setup
+    SAFE_RELEASE(d3d9);
+    return status;
+}
+
+// Mapper init: copy the device pointers from the owner, fetch the D3D11
+// immediate context, and describe the output as an RGB0 image with the same
+// parameters as the source otherwise. Always returns 0.
+static int mapper_init(struct ra_hwdec_mapper *mapper)
+{
+    struct priv *p = mapper->priv;
+    struct priv_owner *owner = mapper->owner->priv;
+
+    p->dev9 = owner->dev9;
+    p->dev11 = owner->dev11;
+    ID3D11Device_GetImmediateContext(p->dev11, &p->ctx11);
+
+    // Same parameters as the input, except that the mapped texture is plain
+    // RGB without any hardware sub-format
+    mapper->dst_params = mapper->src_params;
+    mapper->dst_params.hw_subfmt = 0;
+    mapper->dst_params.imgfmt = IMGFMT_RGB0;
+
+    return 0;
+}
+
+// Release every D3D9/D3D11 object owned by a queue surface, free its ra
+// texture wrapper and finally the surface itself. A NULL surface is ignored.
+static void surf_destroy(struct ra_hwdec_mapper *mapper,
+                         struct queue_surf *surf)
+{
+    if (surf) {
+        // D3D11 side
+        SAFE_RELEASE(surf->tex11);
+        SAFE_RELEASE(surf->idle11);
+        SAFE_RELEASE(surf->stage11);
+        // D3D9 side
+        SAFE_RELEASE(surf->tex9);
+        SAFE_RELEASE(surf->surf9);
+        SAFE_RELEASE(surf->stage9);
+
+        ra_tex_free(mapper->ra, &surf->tex);
+        talloc_free(surf);
+    }
+}
+
+// Create one queue surface sized like the mapper's source image: a shareable
+// D3D11 texture, a D3D9 texture created from its share handle, small staging
+// surfaces on both APIs (used for blocking waits), a D3D11 event query and an
+// ra_tex wrapper around the D3D11 texture.
+// Returns the new surface, or NULL on failure, in which case everything that
+// was created so far is released again.
+static struct queue_surf *surf_create(struct ra_hwdec_mapper *mapper)
+{
+    struct priv *p = mapper->priv;
+    IDXGIResource *res11 = NULL;
+    bool success = false;
+    HRESULT hr;
+
+    // The surface must start out zeroed: on any failure below, the partially
+    // constructed surface is passed to surf_destroy(), which releases every
+    // member. With uninitialized memory that would release garbage pointers.
+    struct queue_surf *surf = talloc_zero(p, struct queue_surf);
+
+    D3D11_TEXTURE2D_DESC desc11 = {
+        .Width = mapper->src->w,
+        .Height = mapper->src->h,
+        .MipLevels = 1,
+        .ArraySize = 1,
+        .Format = DXGI_FORMAT_B8G8R8X8_UNORM,
+        .SampleDesc.Count = 1,
+        .Usage = D3D11_USAGE_DEFAULT,
+        .BindFlags = D3D11_BIND_SHADER_RESOURCE | D3D11_BIND_RENDER_TARGET,
+        .MiscFlags = D3D11_RESOURCE_MISC_SHARED,
+    };
+    hr = ID3D11Device_CreateTexture2D(p->dev11, &desc11, NULL, &surf->tex11);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Failed to create D3D11 texture: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto done;
+    }
+
+    // Try to use a 16x16 staging texture, unless the source surface is
+    // smaller. Ideally, a 1x1 texture would be sufficient, but Microsoft's
+    // D3D9ExDXGISharedSurf example uses 16x16 to avoid driver bugs.
+    D3D11_TEXTURE2D_DESC sdesc11 = {
+        .Width = MPMIN(16, desc11.Width),
+        .Height = MPMIN(16, desc11.Height),
+        .MipLevels = 1,
+        .ArraySize = 1,
+        .Format = DXGI_FORMAT_B8G8R8X8_UNORM,
+        .SampleDesc.Count = 1,
+        .Usage = D3D11_USAGE_STAGING,
+        .CPUAccessFlags = D3D11_CPU_ACCESS_READ,
+    };
+    hr = ID3D11Device_CreateTexture2D(p->dev11, &sdesc11, NULL, &surf->stage11);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Failed to create D3D11 staging texture: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto done;
+    }
+
+    // The share handle is obtained through the texture's DXGI interface
+    hr = ID3D11Texture2D_QueryInterface(surf->tex11, &IID_IDXGIResource,
+                                        (void**)&res11);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Failed to get share handle: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto done;
+    }
+
+    HANDLE share_handle;
+    hr = IDXGIResource_GetSharedHandle(res11, &share_handle);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Failed to get share handle: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto done;
+    }
+
+    // Event query used by surf_is_idle11() to poll whether D3D11 is done
+    hr = ID3D11Device_CreateQuery(p->dev11,
+        &(D3D11_QUERY_DESC) { D3D11_QUERY_EVENT }, &surf->idle11);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Failed to create D3D11 query: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto done;
+    }
+
+    // Share the D3D11 texture with D3D9Ex
+    hr = IDirect3DDevice9Ex_CreateTexture(p->dev9, desc11.Width, desc11.Height,
+        1, D3DUSAGE_RENDERTARGET, D3DFMT_X8R8G8B8, D3DPOOL_DEFAULT,
+        &surf->tex9, &share_handle);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Failed to create D3D9 texture: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto done;
+    }
+
+    hr = IDirect3DTexture9_GetSurfaceLevel(surf->tex9, 0, &surf->surf9);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Failed to get D3D9 surface: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto done;
+    }
+
+    // As above, try to use a 16x16 staging texture to avoid driver bugs
+    hr = IDirect3DDevice9Ex_CreateRenderTarget(p->dev9,
+        MPMIN(16, desc11.Width), MPMIN(16, desc11.Height), D3DFMT_X8R8G8B8,
+        D3DMULTISAMPLE_NONE, 0, TRUE, &surf->stage9, NULL);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Failed to create D3D9 staging surface: %s\n",
+               mp_HRESULT_to_str(hr));
+        goto done;
+    }
+
+    surf->tex = ra_d3d11_wrap_tex(mapper->ra, (ID3D11Resource *)surf->tex11);
+    if (!surf->tex)
+        goto done;
+
+    success = true;
+done:
+    if (!success)
+        surf_destroy(mapper, surf);
+    SAFE_RELEASE(res11);
+    return success ? surf : NULL;
+}
+
+// Returns true if the surface is NOT (or no longer) in use by the D3D11
+// graphics pipeline. A surface flagged busy is checked by polling its event
+// query without flushing; once the query has completed, the busy flag is
+// cleared.
+static bool surf_is_idle11(struct ra_hwdec_mapper *mapper,
+                           struct queue_surf *surf)
+{
+    if (surf->busy11) {
+        struct priv *p = mapper->priv;
+        BOOL done;
+
+        HRESULT res = ID3D11DeviceContext_GetData(p->ctx11,
+            (ID3D11Asynchronous *)surf->idle11, &done, sizeof(done),
+            D3D11_ASYNC_GETDATA_DONOTFLUSH);
+        // S_FALSE means the query result is not available yet
+        if (FAILED(res) || res == S_FALSE || !done)
+            return false;
+
+        surf->busy11 = false;
+    }
+    return true;
+}
+
+// Blocking counterpart of surf_is_idle11(): copy a small corner (at most
+// 16x16) of the shared texture into its staging texture and map that for
+// reading, which is expected to stall until D3D11 is done with the surface.
+// Meant for the queue-underflow case only. Returns false if mapping fails;
+// on success the surface is marked as no longer busy.
+static bool surf_wait_idle11(struct ra_hwdec_mapper *mapper,
+                             struct queue_surf *surf)
+{
+    struct priv *p = mapper->priv;
+    ID3D11Resource *staging = (ID3D11Resource *)surf->stage11;
+
+    ID3D11DeviceContext_CopySubresourceRegion(p->ctx11,
+        staging, 0, 0, 0, 0,
+        (ID3D11Resource *)surf->tex11, 0, (&(D3D11_BOX){
+            .right = MPMIN(16, mapper->src->w),
+            .bottom = MPMIN(16, mapper->src->h),
+            .back = 1,
+        }));
+
+    // Block until the surface becomes idle (see surf_wait_idle9())
+    D3D11_MAPPED_SUBRESOURCE mapped = {0};
+    HRESULT res = ID3D11DeviceContext_Map(p->ctx11, staging, 0,
+                                          D3D11_MAP_READ, 0, &mapped);
+    if (FAILED(res)) {
+        MP_ERR(mapper, "Couldn't map D3D11 staging texture: %s\n",
+               mp_HRESULT_to_str(res));
+        return false;
+    }
+
+    // The data itself is not interesting, only the synchronization
+    ID3D11DeviceContext_Unmap(p->ctx11, staging, 0);
+    surf->busy11 = false;
+    return true;
+}
+
+// Wait until D3D9 has finished its pending work on the surface, then mark the
+// surface as in use by D3D11. Returns false if the staging copy or the lock
+// fails; the busy flag is left untouched in that case.
+static bool surf_wait_idle9(struct ra_hwdec_mapper *mapper,
+                            struct queue_surf *surf)
+{
+    struct priv *p = mapper->priv;
+    HRESULT hr;
+
+    // Rather than polling for the surface to become idle, copy part of the
+    // surface to a staging texture and map it. This should block until the
+    // surface becomes idle. Microsoft's ISurfaceQueue does this as well.
+    RECT rc = {0, 0, MPMIN(16, mapper->src->w), MPMIN(16, mapper->src->h)};
+    hr = IDirect3DDevice9Ex_StretchRect(p->dev9, surf->surf9, &rc, surf->stage9,
+                                        &rc, D3DTEXF_NONE);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Couldn't copy to D3D9 staging texture: %s\n",
+               mp_HRESULT_to_str(hr));
+        return false;
+    }
+
+    D3DLOCKED_RECT lock;
+    hr = IDirect3DSurface9_LockRect(surf->stage9, &lock, NULL, D3DLOCK_READONLY);
+    if (FAILED(hr)) {
+        MP_ERR(mapper, "Couldn't map D3D9 staging texture: %s\n",
+               mp_HRESULT_to_str(hr));
+        return false;
+    }
+
+    IDirect3DSurface9_UnlockRect(surf->stage9);
+
+    // Flag the surface that was actually waited on, instead of whatever sits
+    // at the current queue position. For the caller in mapper_map() these are
+    // the same surface, but this way the function honors its own parameter.
+    surf->busy11 = true;
+    return true;
+}
+
+// Return the surface at the current queue position once D3D11 no longer uses
+// it. If that surface is still busy (or the queue is empty), a new surface is
+// inserted at the current position, up to a limit of 16 surfaces; beyond
+// that, block until the current surface becomes idle.
+// Returns NULL if creating a surface or waiting for one fails.
+static struct queue_surf *surf_acquire(struct ra_hwdec_mapper *mapper)
+{
+    struct priv *p = mapper->priv;
+
+    // Common case: there is a surface and D3D11 is done with it
+    if (p->queue_len && surf_is_idle11(mapper, p->queue[p->queue_pos]))
+        return p->queue[p->queue_pos];
+
+    if (p->queue_len >= 16) {
+        // For sanity, don't let the queue grow beyond 16 surfaces. It
+        // should never get this big. If it does, wait for the surface to
+        // become idle rather than polling it.
+        if (!surf_wait_idle11(mapper, p->queue[p->queue_pos]))
+            return NULL;
+        MP_WARN(mapper, "Queue underflow!\n");
+        return p->queue[p->queue_pos];
+    }
+
+    struct queue_surf *fresh = surf_create(mapper);
+    if (!fresh)
+        return NULL;
+
+    // The next surface is busy, so grow the queue
+    MP_TARRAY_INSERT_AT(p, p->queue, p->queue_len, p->queue_pos, fresh);
+    MP_DBG(mapper, "Queue grew to %d surfaces\n", p->queue_len);
+    return p->queue[p->queue_pos];
+}
+
+// Hand the current surface over to the D3D11 pipeline: end its event query
+// (polled later by surf_is_idle11()) and advance the queue position,
+// wrapping around at the end of the queue.
+static void surf_release(struct ra_hwdec_mapper *mapper)
+{
+    struct priv *p = mapper->priv;
+    struct queue_surf *current = p->queue[p->queue_pos];
+
+    ID3D11DeviceContext_End(p->ctx11, (ID3D11Asynchronous *)current->idle11);
+
+    // The current surface is now in-flight, move to the next surface
+    int next = p->queue_pos + 1;
+    p->queue_pos = next >= p->queue_len ? 0 : next;
+}
+
+// Mapper teardown: destroy every surface in the queue and drop the reference
+// on the D3D11 immediate context that mapper_init() obtained.
+static void mapper_uninit(struct ra_hwdec_mapper *mapper)
+{
+    struct priv *p = mapper->priv;
+
+    for (int i = 0; i < p->queue_len; i++)
+        surf_destroy(mapper, p->queue[i]);
+
+    // ID3D11Device::GetImmediateContext() increments the reference count of
+    // the returned context, so it has to be released here or it leaks.
+    SAFE_RELEASE(p->ctx11);
+}
+
+// Map the current source frame: acquire a queue surface, StretchRect() the
+// D3D9 surface stored in src->planes[3] onto it, wait for D3D9 to finish and
+// expose the surface's ra texture as tex[0].
+// Returns 0 on success, -1 on failure.
+static int mapper_map(struct ra_hwdec_mapper *mapper)
+{
+    struct priv *p = mapper->priv;
+
+    struct queue_surf *target = surf_acquire(mapper);
+    if (!target)
+        return -1;
+
+    IDirect3DSurface9 *decoded = (IDirect3DSurface9 *)mapper->src->planes[3];
+    RECT whole = {0, 0, mapper->src->w, mapper->src->h};
+
+    HRESULT res = IDirect3DDevice9Ex_StretchRect(p->dev9, decoded, &whole,
+                                                 target->surf9, &whole,
+                                                 D3DTEXF_NONE);
+    if (FAILED(res)) {
+        MP_ERR(mapper, "StretchRect() failed: %s\n", mp_HRESULT_to_str(res));
+        return -1;
+    }
+
+    // D3D9 must be done writing before D3D11 samples from the shared texture
+    if (!surf_wait_idle9(mapper, target))
+        return -1;
+
+    mapper->tex[0] = target->tex;
+    return 0;
+}
+
+// Unmap the current frame. Only acts if tex[0] still refers to the surface at
+// the current queue position; in that case the surface is released to the
+// D3D11 pipeline and tex[0] is cleared.
+static void mapper_unmap(struct ra_hwdec_mapper *mapper)
+{
+    struct priv *p = mapper->priv;
+
+    // Nothing queued at this position (e.g. nothing was ever mapped)
+    if (p->queue_pos >= p->queue_len)
+        return;
+    // tex[0] is not the surface at the current queue position
+    if (p->queue[p->queue_pos]->tex != mapper->tex[0])
+        return;
+
+    surf_release(mapper);
+    mapper->tex[0] = NULL;
+}
+
+// Driver table for the "dxva2-dxgi" interop: accepts IMGFMT_DXVA2 frames and
+// maps them to D3D11 textures through the callbacks defined in this file.
+const struct ra_hwdec_driver ra_hwdec_dxva2dxgi = {
+    .name = "dxva2-dxgi",
+    .priv_size = sizeof(struct priv_owner),
+    .imgfmts = {IMGFMT_DXVA2, 0}, // 0 terminates the list
+    .init = init,
+    .uninit = uninit,
+    .mapper = &(const struct ra_hwdec_mapper_driver){
+        .priv_size = sizeof(struct priv),
+        .init = mapper_init,
+        .uninit = mapper_uninit,
+        .map = mapper_map,
+        .unmap = mapper_unmap,
+    },
+};
diff --git a/video/out/d3d11/ra_d3d11.c b/video/out/d3d11/ra_d3d11.c
index 63dc5b9..1d24558 100644
--- a/video/out/d3d11/ra_d3d11.c
+++ b/video/out/d3d11/ra_d3d11.c
@@ -78,6 +78,9 @@ struct d3d_tex {
     ID3D11Texture3D *tex3d;
     int array_slice;
 
+    // Staging texture for tex_download(), 2D only
+    ID3D11Texture2D *staging;
+
     ID3D11ShaderResourceView *srv;
     ID3D11RenderTargetView *rtv;
     ID3D11UnorderedAccessView *uav;
@@ -86,9 +89,9 @@ struct d3d_tex {
 
 struct d3d_buf {
     ID3D11Buffer *buf;
-    ID3D11Buffer *staging;
     ID3D11UnorderedAccessView *uav;
-    void *data; // Data for mapped staging texture
+    void *data; // System-memory mirror of the data in buf
+    bool dirty; // Is buf out of date?
 };
 
 struct d3d_rpass {
@@ -181,6 +184,7 @@ static struct d3d_fmt formats[] = {
 
     { "rgb10_a2", 4,  4, {10, 10, 10,  2}, DXFMT(R10G10B10A2, UNORM)  },
     { "bgra8",    4,  4, { 8,  8,  8,  8}, DXFMT(B8G8R8A8, UNORM), .unordered = true },
+    { "bgrx8",    3,  4, { 8,  8,  8},     DXFMT(B8G8R8X8, UNORM), .unordered = true },
 };
 
 static bool dll_version_equal(struct dll_version a, struct dll_version b)
@@ -358,12 +362,17 @@ static void tex_destroy(struct ra *ra, struct ra_tex *tex)
     SAFE_RELEASE(tex_p->uav);
     SAFE_RELEASE(tex_p->sampler);
     SAFE_RELEASE(tex_p->res);
+    SAFE_RELEASE(tex_p->staging);
     talloc_free(tex);
 }
 
 static struct ra_tex *tex_create(struct ra *ra,
                                  const struct ra_tex_params *params)
 {
+    // Only 2D textures may be downloaded for now
+    if (params->downloadable && params->dimensions != 2)
+        return NULL;
+
     struct ra_d3d11 *p = ra->priv;
     HRESULT hr;
 
@@ -436,6 +445,21 @@ static struct ra_tex *tex_create(struct ra *ra,
             goto error;
         }
         tex_p->res = (ID3D11Resource *)tex_p->tex2d;
+
+        // Create a staging texture with CPU access for tex_download()
+        if (params->downloadable) {
+            desc2d.BindFlags = 0;
+            desc2d.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+            desc2d.Usage = D3D11_USAGE_STAGING;
+
+            hr = ID3D11Device_CreateTexture2D(p->dev, &desc2d, NULL,
+                                              &tex_p->staging);
+            if (FAILED(hr)) {
+                MP_ERR(ra, "Failed to staging texture: %s\n",
+                       mp_HRESULT_to_str(hr));
+                goto error;
+            }
+        }
         break;
     case 3:;
         D3D11_TEXTURE3D_DESC desc3d = {
@@ -651,17 +675,45 @@ static bool tex_upload(struct ra *ra, const struct ra_tex_upload_params *params)
     return true;
 }
 
+// Read a texture back into system memory. The texture is copied into its
+// staging texture, which is then mapped for reading and copied row by row
+// into params->dst using params->stride as the destination pitch.
+// Returns false if the texture has no staging texture or if mapping fails.
+static bool tex_download(struct ra *ra, struct ra_tex_download_params *params)
+{
+    struct ra_d3d11 *p = ra->priv;
+    struct ra_tex *src_tex = params->tex;
+    struct d3d_tex *src_p = src_tex->priv;
+
+    if (!src_p->staging)
+        return false;
+    ID3D11Resource *staging = (ID3D11Resource*)src_p->staging;
+
+    ID3D11DeviceContext_CopyResource(p->ctx, staging, src_p->res);
+
+    D3D11_MAPPED_SUBRESOURCE mapped;
+    HRESULT res = ID3D11DeviceContext_Map(p->ctx, staging, 0,
+                                          D3D11_MAP_READ, 0, &mapped);
+    if (FAILED(res)) {
+        MP_ERR(ra, "Failed to map staging texture: %s\n", mp_HRESULT_to_str(res));
+        return false;
+    }
+
+    // Source and destination pitches may differ, so copy one row at a time
+    // and never more than the smaller of the two pitches
+    char *out = params->dst;
+    char *in = mapped.pData;
+    for (int row = 0; row < src_tex->params.h; row++) {
+        memcpy(out + row * params->stride, in + row * mapped.RowPitch,
+               MPMIN(params->stride, mapped.RowPitch));
+    }
+
+    ID3D11DeviceContext_Unmap(p->ctx, staging, 0);
+
+    return true;
+}
+
 static void buf_destroy(struct ra *ra, struct ra_buf *buf)
 {
     if (!buf)
         return;
-    struct ra_d3d11 *p = ra->priv;
     struct d3d_buf *buf_p = buf->priv;
-
-    if (buf_p->data)
-        ID3D11DeviceContext_Unmap(p->ctx, (ID3D11Resource *)buf_p->staging, 0);
     SAFE_RELEASE(buf_p->buf);
-    SAFE_RELEASE(buf_p->staging);
     SAFE_RELEASE(buf_p->uav);
     talloc_free(buf);
 }
@@ -705,24 +757,13 @@ static struct ra_buf *buf_create(struct ra *ra,
         goto error;
     }
 
-    if (params->host_mutable) {
-        // D3D11 doesn't allow constant buffer updates that aren't aligned to a
-        // full constant boundary (vec4,) and some drivers don't allow partial
-        // constant buffer updates at all, but the RA consumer is allowed to
-        // partially update an ra_buf. The best way to handle partial updates
-        // without causing a pipeline stall is probably to keep a copy of the
-        // data in a staging buffer.
-
-        desc.Usage = D3D11_USAGE_STAGING;
-        desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
-        desc.BindFlags = 0;
-        hr = ID3D11Device_CreateBuffer(p->dev, &desc, NULL, &buf_p->staging);
-        if (FAILED(hr)) {
-            MP_ERR(ra, "Failed to create staging buffer: %s\n",
-                   mp_HRESULT_to_str(hr));
-            goto error;
-        }
-    }
+    // D3D11 doesn't allow constant buffer updates that aren't aligned to a
+    // full constant boundary (vec4,) and some drivers don't allow partial
+    // constant buffer updates at all. To support partial buffer updates, keep
+    // a mirror of the buffer data in system memory and upload the whole thing
+    // before the buffer is used.
+    if (params->host_mutable)
+        buf_p->data = talloc_zero_size(buf, desc.ByteWidth);
 
     if (params->type == RA_BUF_TYPE_SHADER_STORAGE) {
         D3D11_UNORDERED_ACCESS_VIEW_DESC udesc = {
@@ -752,40 +793,23 @@ static void buf_resolve(struct ra *ra, struct ra_buf *buf)
     struct ra_d3d11 *p = ra->priv;
     struct d3d_buf *buf_p = buf->priv;
 
-    assert(buf->params.host_mutable);
-    if (!buf_p->data)
+    if (!buf->params.host_mutable || !buf_p->dirty)
         return;
 
-    ID3D11DeviceContext_Unmap(p->ctx, (ID3D11Resource *)buf_p->staging, 0);
-    buf_p->data = NULL;
-
-    // Synchronize the GPU buffer with the staging buffer
-    ID3D11DeviceContext_CopyResource(p->ctx, (ID3D11Resource *)buf_p->buf,
-                                     (ID3D11Resource *)buf_p->staging);
+    // Synchronize the GPU buffer with the system-memory copy
+    ID3D11DeviceContext_UpdateSubresource(p->ctx, (ID3D11Resource *)buf_p->buf,
+        0, NULL, buf_p->data, 0, 0);
+    buf_p->dirty = false;
 }
 
 static void buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
                        const void *data, size_t size)
 {
-    struct ra_d3d11 *p = ra->priv;
     struct d3d_buf *buf_p = buf->priv;
-    HRESULT hr;
-
-    if (!buf_p->data) {
-        // If this is the first update after the buffer was created or after it
-        // has been used in a renderpass, it will be unmapped, so map it
-        D3D11_MAPPED_SUBRESOURCE map = {0};
-        hr = ID3D11DeviceContext_Map(p->ctx, (ID3D11Resource *)buf_p->staging,
-                                     0, D3D11_MAP_WRITE, 0, &map);
-        if (FAILED(hr)) {
-            MP_ERR(ra, "Failed to map resource\n");
-            return;
-        }
-        buf_p->data = map.pData;
-    }
 
     char *cdata = buf_p->data;
     memcpy(cdata + offset, data, size);
+    buf_p->dirty = true;
 }
 
 static const char *get_shader_target(struct ra *ra, enum glsl_shader type)
@@ -2077,6 +2101,7 @@ static struct ra_fns ra_fns_d3d11 = {
     .tex_create         = tex_create,
     .tex_destroy        = tex_destroy,
     .tex_upload         = tex_upload,
+    .tex_download       = tex_download,
     .buf_create         = buf_create,
     .buf_destroy        = buf_destroy,
     .buf_update         = buf_update,
diff --git a/video/out/dr_helper.c b/video/out/dr_helper.c
new file mode 100644
index 0000000..e826d08
--- /dev/null
+++ b/video/out/dr_helper.c
@@ -0,0 +1,130 @@
+#include <stdlib.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include <libavutil/buffer.h>
+
+#include "mpv_talloc.h"
+#include "misc/dispatch.h"
+#include "osdep/atomic.h"
+#include "video/mp_image.h"
+
+#include "dr_helper.h"
+
+// State of a DR helper, see dr_helper_create().
+struct dr_helper {
+    pthread_t thread; // Thread that called dr_helper_create()
+    struct mp_dispatch_queue *dispatch; // Used to run work on that thread
+    atomic_ullong dr_in_flight; // Images handed out and not yet freed
+
+    // User callback that performs the actual allocation, and its context.
+    struct mp_image *(*get_image)(void *ctx, int imgfmt, int w, int h,
+                                  int stride_align);
+    void *get_image_ctx;
+};
+
+// talloc destructor for struct dr_helper. Only checks that no image handed
+// out by the helper is still alive.
+static void dr_helper_destroy(void *ptr)
+{
+    struct dr_helper *helper = ptr;
+
+    // Every outstanding image keeps a pointer to the helper, so destroying
+    // it while images are in flight would leave those pointers dangling.
+    assert(!atomic_load(&helper->dr_in_flight));
+}
+
+// Allocate a DR helper bound to the calling thread. get_image/get_image_ctx
+// is the allocation callback that dr_helper_get_image() runs through the
+// given dispatch queue. The helper is talloc-allocated without a parent.
+struct dr_helper *dr_helper_create(struct mp_dispatch_queue *dispatch,
+            struct mp_image *(*get_image)(void *ctx, int imgfmt, int w, int h,
+                                          int stride_align),
+            void *get_image_ctx)
+{
+    struct dr_helper *helper = talloc_ptrtype(NULL, helper);
+    *helper = (struct dr_helper){
+        // Remember the creating thread; frees are routed back to it
+        .thread = pthread_self(),
+        .dispatch = dispatch,
+        .dr_in_flight = ATOMIC_VAR_INIT(0),
+        .get_image = get_image,
+        .get_image_ctx = get_image_ctx,
+    };
+    talloc_set_destructor(helper, dr_helper_destroy);
+    return helper;
+}
+
+// Context attached to the wrapper buffer created in sync_get_image(); passed
+// to free_dr_buffer_on_dr_thread() when the wrapper is freed.
+struct free_dr_context {
+    struct dr_helper *dr; // Helper that handed out the image
+    AVBufferRef *ref; // Original buffer reference, unreffed in dr_thread_free()
+};
+
+// Free a DR buffer: decrement the helper's in-flight counter, drop the
+// original buffer reference and free the context. Called directly or via the
+// dispatch queue by free_dr_buffer_on_dr_thread().
+static void dr_thread_free(void *ptr)
+{
+    struct free_dr_context *fctx = ptr;
+    struct dr_helper *helper = fctx->dr;
+
+    // Adding -1 to the unsigned counter decrements it; the returned value is
+    // the count before the decrement and must not have been 0 already.
+    unsigned long long before = atomic_fetch_add(&helper->dr_in_flight, -1);
+    assert(before);
+
+    av_buffer_unref(&fctx->ref);
+    talloc_free(fctx);
+}
+
+// AVBuffer free callback for the wrapper buffer created in sync_get_image().
+// Makes sure the real free happens on the helper's thread: directly if we are
+// already on it, otherwise by running it through the dispatch queue.
+static void free_dr_buffer_on_dr_thread(void *opaque, uint8_t *data)
+{
+    struct free_dr_context *fctx = opaque;
+    struct dr_helper *helper = fctx->dr;
+
+    if (!pthread_equal(helper->thread, pthread_self())) {
+        mp_dispatch_run(helper->dispatch, dr_thread_free, fctx);
+        return;
+    }
+
+    // The image could be unreffed even on the DR thread. In practice, this
+    // matters most on DR destruction.
+    dr_thread_free(fctx);
+}
+
+// Arguments and result of one dr_helper_get_image() call, passed to
+// sync_get_image() through the dispatch queue.
+struct get_image_cmd {
+    struct dr_helper *dr;
+    int imgfmt, w, h, stride_align; // Forwarded unchanged to dr->get_image
+    struct mp_image *res; // Result; NULL if get_image returned NULL
+};
+
+// Dispatch-queue callback behind dr_helper_get_image(): run the user's
+// get_image callback and, if it produced an image, replace the image's single
+// buffer reference with a wrapper whose free callback routes the real free
+// back to the helper's thread. The result is stored in cmd->res.
+static void sync_get_image(void *ptr)
+{
+    struct get_image_cmd *cmd = ptr;
+    struct dr_helper *dr = cmd->dr;
+
+    struct mp_image *img = dr->get_image(dr->get_image_ctx, cmd->imgfmt,
+                                         cmd->w, cmd->h, cmd->stride_align);
+    cmd->res = img;
+    if (!img)
+        return;
+
+    // We require exactly 1 AVBufferRef.
+    assert(img->bufs[0]);
+    assert(!img->bufs[1]);
+
+    // The wrapper created below aliases the memory of the original ref, which
+    // is why the original ref must be writable in the first place. (A newly
+    // allocated image should be always writable of course.)
+    assert(mp_image_is_writeable(img));
+
+    AVBufferRef *orig = img->bufs[0];
+
+    struct free_dr_context *fctx = talloc_zero(NULL, struct free_dr_context);
+    fctx->dr = dr;
+    fctx->ref = orig;
+
+    AVBufferRef *alias = av_buffer_create(orig->data, orig->size,
+                                          free_dr_buffer_on_dr_thread, fctx, 0);
+    if (!alias)
+        abort(); // tiny malloc OOM
+
+    img->bufs[0] = alias;
+
+    atomic_fetch_add(&dr->dr_in_flight, 1);
+}
+
+// Allocate an image by running the helper's get_image callback through its
+// dispatch queue. Returns the image, or NULL if the callback returned NULL.
+struct mp_image *dr_helper_get_image(struct dr_helper *dr, int imgfmt,
+                                     int w, int h, int stride_align)
+{
+    struct get_image_cmd request = {0};
+    request.dr = dr;
+    request.imgfmt = imgfmt;
+    request.w = w;
+    request.h = h;
+    request.stride_align = stride_align;
+
+    mp_dispatch_run(dr->dispatch, sync_get_image, &request);
+
+    return request.res;
+}
diff --git a/video/out/dr_helper.h b/video/out/dr_helper.h
new file mode 100644
index 0000000..cf37c57
--- /dev/null
+++ b/video/out/dr_helper.h
@@ -0,0 +1,20 @@
+#pragma once
+
+// This is a helper for implementing thread-safety for DR callbacks. These need
+// to allocate GPU buffers on the GPU thread (e.g. OpenGL with its forced TLS),
+// and the buffers also need to be freed on the GPU thread.
+struct dr_helper;
+
+struct mp_image;
+struct mp_dispatch_queue;
+
+// Create a DR helper.
+// This MUST be called on the "target" thread (it will call pthread_self()).
+// dr_helper_get_image() calls will use the dispatch queue to run get_image on
+// the target thread too. get_image_ctx is passed to get_image as its ctx
+// argument.
+// The returned helper is talloc-allocated; when it is destroyed, all images
+// obtained through dr_helper_get_image() must already have been freed (this
+// is asserted).
+struct dr_helper *dr_helper_create(struct mp_dispatch_queue *dispatch,
+            struct mp_image *(*get_image)(void *ctx, int imgfmt, int w, int h,
+                                          int stride_align),
+            void *get_image_ctx);
+
+// Allocate an image by running the get_image callback passed to
+// dr_helper_create() through the helper's dispatch queue, with the given
+// format, size and stride alignment. Returns NULL if the callback returned
+// NULL. The buffer of a returned image is freed on the thread that created
+// the helper, regardless of which thread drops the last reference.
+struct mp_image *dr_helper_get_image(struct dr_helper *dr, int imgfmt,
+                                     int w, int h, int stride_align);
diff --git a/video/out/drm_atomic.c b/video/out/drm_atomic.c
index 7a55483..5c6b3bb 100644
--- a/video/out/drm_atomic.c
+++ b/video/out/drm_atomic.c
@@ -78,6 +78,17 @@ int drm_object_get_property(struct drm_object *object, char *name, uint64_t *val
    return -EINVAL;
 }
 
+// Look up the property called name on the object, treat its value as a blob
+// ID and fetch that blob through the object's DRM fd.
+// Returns NULL if the property lookup fails; otherwise returns whatever
+// drmModeGetPropertyBlob() returns.
+drmModePropertyBlobPtr drm_object_get_property_blob(struct drm_object *object, char *name)
+{
+   uint64_t id;
+
+   // drm_object_get_property() reports success with 0
+   if (drm_object_get_property(object, name, &id) != 0)
+       return NULL;
+
+   return drmModeGetPropertyBlob(object->fd, id);
+}
+
 int drm_object_set_property(drmModeAtomicReq *request, struct drm_object *object,
                             char *name, uint64_t value)
 {
@@ -98,6 +109,7 @@ struct drm_object * drm_object_create(struct mp_log *log, int fd,
     obj = talloc_zero(NULL, struct drm_object);
     obj->id = object_id;
     obj->type = type;
+    obj->fd = fd;
 
     if (drm_object_create_properties(log, fd, obj)) {
         talloc_free(obj);
@@ -125,16 +137,18 @@ void drm_object_print_info(struct mp_log *log, struct drm_object *object)
                (long long)object->props->prop_values[i]);
 }
 
-struct drm_atomic_context *drm_atomic_create_context(struct mp_log *log, int fd,
-                                                     int crtc_id, int overlay_id)
+struct drm_atomic_context *drm_atomic_create_context(struct mp_log *log, int fd, int crtc_id,
+                                                     int connector_id, int osd_plane_id, int video_plane_id)
 {
-    drmModePlane *drmplane = NULL;
     drmModePlaneRes *plane_res = NULL;
     drmModeRes *res = NULL;
     struct drm_object *plane = NULL;
     struct drm_atomic_context *ctx;
     int crtc_index = -1;
-    int layercount = 0;
+    int layercount = -1;
+    int primary_id = 0;
+    int overlay_id = 0;
+
     uint64_t value;
 
     res = drmModeGetResources(fd);
@@ -169,55 +183,95 @@ struct drm_atomic_context *drm_atomic_create_context(struct mp_log *log, int fd,
         }
     }
 
+    for (int i = 0; i < res->count_connectors; i++) {
+        drmModeConnector *connector = drmModeGetConnector(fd, res->connectors[i]);
+        if (connector) {
+            if (connector->connector_id == connector_id)
+                ctx->connector =  drm_object_create(log, ctx->fd, connector->connector_id,
+                                                    DRM_MODE_OBJECT_CONNECTOR);
+
+            drmModeFreeConnector(connector);
+            if (ctx->connector)
+                break;
+        }
+    }
+
     for (unsigned int j = 0; j < plane_res->count_planes; j++) {
 
-        drmplane = drmModeGetPlane (ctx->fd, plane_res->planes[j]);
-        if (drmplane->possible_crtcs & (1 << crtc_index)) {
-            plane = drm_object_create(log, ctx->fd, drmplane->plane_id,
+        drmModePlane *drmplane = drmModeGetPlane(ctx->fd, plane_res->planes[j]);
+        const uint32_t possible_crtcs = drmplane->possible_crtcs;
+        const uint32_t plane_id = drmplane->plane_id;
+        drmModeFreePlane(drmplane);
+        drmplane = NULL;
+
+        if (possible_crtcs & (1 << crtc_index)) {
+            plane = drm_object_create(log, ctx->fd, plane_id,
                                       DRM_MODE_OBJECT_PLANE);
 
-            if (plane) {
-                if (drm_object_get_property(plane, "TYPE", &value) == -EINVAL) {
-                    mp_err(log, "Unable to retrieve type property from plane %d\n", j);
-                    goto fail;
-                } else {
-                    if ((value == DRM_PLANE_TYPE_OVERLAY) &&
-                            (layercount == overlay_id)) {
-                        ctx->overlay_plane = plane;
-                    }
-                    else if (value == DRM_PLANE_TYPE_PRIMARY) {
-                        ctx->primary_plane = plane;
-                    }
-                    else {
-                        drm_object_free(plane);
-                        plane = NULL;
-                    }
-
-                    if (value == DRM_PLANE_TYPE_OVERLAY)
-                        layercount++;
-                }
-            } else {
+            if (!plane) {
                 mp_err(log, "Failed to create Plane object from plane ID %d\n",
-                       drmplane->plane_id);
+                       plane_id);
                 goto fail;
             }
+
+            if (drm_object_get_property(plane, "TYPE", &value) == -EINVAL) {
+                mp_err(log, "Unable to retrieve type property from plane %d\n", j);
+                goto fail;
+            }
+
+            if (value != DRM_PLANE_TYPE_CURSOR) { // Skip cursor planes
+                layercount++;
+
+                if ((!primary_id) && (value == DRM_PLANE_TYPE_PRIMARY))
+                    primary_id = plane_id;
+
+                if ((!overlay_id) && (value == DRM_PLANE_TYPE_OVERLAY))
+                    overlay_id = plane_id;
+
+                if (layercount == osd_plane_id) {
+                    ctx->osd_plane = plane;
+                    continue;
+                }
+
+                if (layercount == video_plane_id) {
+                    ctx->video_plane = plane;
+                    continue;
+                }
+            }
+
+            drm_object_free(plane);
+            plane = NULL;
         }
-        drmModeFreePlane(drmplane);
-        drmplane = NULL;
     }
 
-    if (!ctx->primary_plane) {
-        mp_err(log, "Failed to find primary plane\n");
-        goto fail;
+    // default OSD plane to primary if unspecified
+    if (!ctx->osd_plane) {
+        if (primary_id) {
+            mp_verbose(log, "Using default plane %d for OSD\n", primary_id);
+            ctx->osd_plane = drm_object_create(log, ctx->fd, primary_id, DRM_MODE_OBJECT_PLANE);
+        } else {
+            mp_err(log, "Failed to find OSD plane with id=%d\n", osd_plane_id);
+            goto fail;
+        }
+    } else {
+        mp_verbose(log, "Found OSD plane with ID %d\n", ctx->osd_plane->id);
     }
 
-    if (!ctx->overlay_plane) {
-        mp_err(log, "Failed to find overlay plane with id=%d\n", overlay_id);
-        goto fail;
+    // default video plane to overlay if unspecified
+    if (!ctx->video_plane) {
+        if (overlay_id) {
+            mp_verbose(log, "Using default plane %d for video\n", overlay_id);
+            ctx->video_plane = drm_object_create(log, ctx->fd, overlay_id, DRM_MODE_OBJECT_PLANE);
+        } else {
+            mp_err(log, "Failed to find video plane with id=%d\n", video_plane_id);
+            goto fail;
+        }
+    } else {
+        mp_verbose(log, "Found video plane with ID %d\n", ctx->video_plane->id);
     }
 
-    mp_verbose(log, "Found Primary plane with ID %d, overlay with ID %d\n",
-               ctx->primary_plane->id, ctx->overlay_plane->id);
+    mp_verbose(log, "Found Video plane with ID %d, OSD with ID %d\n",
+               ctx->video_plane->id, ctx->osd_plane->id);
 
     drmModeFreePlaneResources(plane_res);
     drmModeFreeResources(res);
@@ -229,8 +283,6 @@ fail:
         drmModeFreeResources(res);
     if (plane_res)
         drmModeFreePlaneResources(plane_res);
-    if (drmplane)
-        drmModeFreePlane(drmplane);
     if (plane)
         drm_object_free(plane);
     return NULL;
@@ -238,8 +290,153 @@ fail:
 
 void drm_atomic_destroy_context(struct drm_atomic_context *ctx)
 {
+    drm_mode_destroy_blob(ctx->fd, &ctx->old_state.crtc.mode);
     drm_object_free(ctx->crtc);
-    drm_object_free(ctx->primary_plane);
-    drm_object_free(ctx->overlay_plane);
+    drm_object_free(ctx->connector);
+    drm_object_free(ctx->osd_plane);
+    drm_object_free(ctx->video_plane);
     talloc_free(ctx);
 }
+
+// Read the properties FB_ID, CRTC_ID, SRC_X/Y/W/H and CRTC_X/Y/W/H (plus the
+// optional ZPOS) of `plane` into `plane_state`. All reads are attempted even
+// after one has failed; the result is false if any read other than ZPOS
+// returned a negative value.
+static bool drm_atomic_save_plane_state(struct drm_object *plane,
+                                        struct drm_atomic_plane_state *plane_state)
+{
+    struct drm_atomic_plane_state *st = plane_state;
+    // becomes non-zero as soon as one required read fails
+    int failed = 0;
+
+    // framebuffer and CRTC ids
+    failed |= (drm_object_get_property(plane, "FB_ID", &st->fb_id) < 0);
+    failed |= (drm_object_get_property(plane, "CRTC_ID", &st->crtc_id) < 0);
+    // SRC_* values
+    failed |= (drm_object_get_property(plane, "SRC_X", &st->src_x) < 0);
+    failed |= (drm_object_get_property(plane, "SRC_Y", &st->src_y) < 0);
+    failed |= (drm_object_get_property(plane, "SRC_W", &st->src_w) < 0);
+    failed |= (drm_object_get_property(plane, "SRC_H", &st->src_h) < 0);
+    // CRTC_* values
+    failed |= (drm_object_get_property(plane, "CRTC_X", &st->crtc_x) < 0);
+    failed |= (drm_object_get_property(plane, "CRTC_Y", &st->crtc_y) < 0);
+    failed |= (drm_object_get_property(plane, "CRTC_W", &st->crtc_w) < 0);
+    failed |= (drm_object_get_property(plane, "CRTC_H", &st->crtc_h) < 0);
+
+    // ZPOS might not exist, so ignore whether or not this succeeds
+    drm_object_get_property(plane, "ZPOS", &st->zpos);
+
+    return !failed;
+}
+
+// Pass the values held in `plane_state` (FB_ID, CRTC_ID, SRC_X/Y/W/H,
+// CRTC_X/Y/W/H and the optional ZPOS) to drm_object_set_property() for `plane`
+// on `request`. Every property is attempted; returns false if any call other
+// than the ZPOS one returned a negative value.
+static bool drm_atomic_restore_plane_state(drmModeAtomicReq *request,
+                                           struct drm_object *plane,
+                                           const struct drm_atomic_plane_state *plane_state)
+{
+    const struct drm_atomic_plane_state *st = plane_state;
+    // becomes non-zero as soon as one required property could not be set
+    int failed = 0;
+
+    // framebuffer and CRTC ids
+    failed |= (drm_object_set_property(request, plane, "FB_ID", st->fb_id) < 0);
+    failed |= (drm_object_set_property(request, plane, "CRTC_ID", st->crtc_id) < 0);
+    // SRC_* values
+    failed |= (drm_object_set_property(request, plane, "SRC_X", st->src_x) < 0);
+    failed |= (drm_object_set_property(request, plane, "SRC_Y", st->src_y) < 0);
+    failed |= (drm_object_set_property(request, plane, "SRC_W", st->src_w) < 0);
+    failed |= (drm_object_set_property(request, plane, "SRC_H", st->src_h) < 0);
+    // CRTC_* values
+    failed |= (drm_object_set_property(request, plane, "CRTC_X", st->crtc_x) < 0);
+    failed |= (drm_object_set_property(request, plane, "CRTC_Y", st->crtc_y) < 0);
+    failed |= (drm_object_set_property(request, plane, "CRTC_W", st->crtc_w) < 0);
+    failed |= (drm_object_set_property(request, plane, "CRTC_H", st->crtc_h) < 0);
+
+    // ZPOS might not exist, so ignore whether or not this succeeds
+    drm_object_set_property(request, plane, "ZPOS", st->zpos);
+
+    return !failed;
+}
+
+// Snapshot CRTC mode/"ACTIVE", connector "CRTC_ID" and both plane states into
+// ctx->old_state; false if already saved or if any read (or drmModeGetCrtc) fails.
+bool drm_atomic_save_old_state(struct drm_atomic_context *ctx)
+{
+    if (ctx->old_state.saved)
+        return false;
+
+    drmModeCrtc *crtc = drmModeGetCrtc(ctx->fd, ctx->crtc->id);
+    if (crtc == NULL)
+        return false;
+    // A blob created by an earlier restore belongs to the previous snapshot;
+    // drop it so drm_mode_ensure_blob() builds one for the mode saved now.
+    drm_mode_destroy_blob(ctx->fd, &ctx->old_state.crtc.mode);
+    ctx->old_state.crtc.mode.mode = crtc->mode;
+    drmModeFreeCrtc(crtc);
+
+    bool ret = true;
+    if (0 > drm_object_get_property(ctx->crtc, "ACTIVE", &ctx->old_state.crtc.active))
+        ret = false;
+    if (0 > drm_object_get_property(ctx->connector, "CRTC_ID", &ctx->old_state.connector.crtc_id))
+        ret = false;
+    if (!drm_atomic_save_plane_state(ctx->osd_plane, &ctx->old_state.osd_plane))
+        ret = false;
+    if (!drm_atomic_save_plane_state(ctx->video_plane, &ctx->old_state.video_plane))
+        ret = false;
+    ctx->old_state.saved = true; // set even when some of the reads failed
+    return ret;
+}
+
+// Hand the state captured in ctx->old_state to `request`: connector "CRTC_ID",
+// CRTC "MODE_ID" and "ACTIVE", then the OSD and video plane states. Returns
+// false if nothing was saved or if any step failed; every step is attempted
+// regardless of earlier failures.
+bool drm_atomic_restore_old_state(drmModeAtomicReqPtr request, struct drm_atomic_context *ctx)
+{
+    struct drm_atomic_state *old = &ctx->old_state;
+
+    if (!old->saved)
+        return false;
+
+    int failed = 0;
+    failed |= (drm_object_set_property(request, ctx->connector, "CRTC_ID", old->connector.crtc_id) < 0);
+
+    // MODE_ID is set to the blob id, so make sure the saved mode has a blob first
+    failed |= !drm_mode_ensure_blob(ctx->fd, &old->crtc.mode);
+    failed |= (drm_object_set_property(request, ctx->crtc, "MODE_ID", old->crtc.mode.blob_id) < 0);
+    failed |= (drm_object_set_property(request, ctx->crtc, "ACTIVE", old->crtc.active) < 0);
+
+    failed |= !drm_atomic_restore_plane_state(request, ctx->osd_plane, &old->osd_plane);
+    failed |= !drm_atomic_restore_plane_state(request, ctx->video_plane, &old->video_plane);
+
+    // the snapshot counts as consumed whether or not every step succeeded
+    old->saved = false;
+    return !failed;
+}
+
+// Create the property blob for mode->mode unless mode->blob_id is already set.
+// Returns true when a blob id was already present or creation returned 0.
+bool drm_mode_ensure_blob(int fd, struct drm_mode *mode)
+{
+    if (mode->blob_id)
+        return true;
+
+    const int err = drmModeCreatePropertyBlob(fd, &mode->mode,
+                                              sizeof(drmModeModeInfo), &mode->blob_id);
+    return err == 0;
+}
+
+// Destroy the blob for `mode` if one exists and clear blob_id regardless of
+// the outcome; returns true when nothing had to be done or libdrm returned 0.
+bool drm_mode_destroy_blob(int fd, struct drm_mode *mode)
+{
+    if (!mode->blob_id)
+        return true;
+
+    const int err = drmModeDestroyPropertyBlob(fd, mode->blob_id);
+    mode->blob_id = 0;
+    return err == 0;
+}
diff --git a/video/out/drm_atomic.h b/video/out/drm_atomic.h
index d0ebdb9..cd0252a 100644
--- a/video/out/drm_atomic.h
+++ b/video/out/drm_atomic.h
@@ -19,12 +19,47 @@
 #define MP_DRMATOMIC_H
 
 #include <stdlib.h>
+#include <stdbool.h>
 #include <xf86drm.h>
 #include <xf86drmMode.h>
 
 #include "common/msg.h"
 
+struct drm_mode {
+    drmModeModeInfo mode; // mode description; drm_mode_ensure_blob() builds the blob from it
+    uint32_t blob_id;     // property blob id for `mode`; 0 means no blob has been created
+};
+
+struct drm_atomic_plane_state { // one saved value per plane property (name in quotes)
+    uint64_t fb_id;   // "FB_ID"
+    uint64_t crtc_id; // "CRTC_ID"
+    uint64_t src_x;   // "SRC_X"
+    uint64_t src_y;   // "SRC_Y"
+    uint64_t src_w;   // "SRC_W"
+    uint64_t src_h;   // "SRC_H"
+    uint64_t crtc_x;  // "CRTC_X"
+    uint64_t crtc_y;  // "CRTC_Y"
+    uint64_t crtc_w;  // "CRTC_W"
+    uint64_t crtc_h;  // "CRTC_H"
+    uint64_t zpos;    // "ZPOS"; optional, save/restore ignore failures for it
+};
+
+// Used to store the restore state for VT switching and uninit
+struct drm_atomic_state {
+    bool saved;               // set by drm_atomic_save_old_state(), cleared by the restore
+    struct {
+        uint64_t crtc_id;     // connector "CRTC_ID" property
+    } connector;
+    struct {
+        struct drm_mode mode; // CRTC mode copied from drmModeGetCrtc()
+        uint64_t active;      // CRTC "ACTIVE" property
+    } crtc;
+    struct drm_atomic_plane_state osd_plane;   // snapshot taken from ctx->osd_plane
+    struct drm_atomic_plane_state video_plane; // snapshot taken from ctx->video_plane
+};
+
 struct drm_object {
+    int fd;
     uint32_t id;
     uint32_t type;
     drmModeObjectProperties *props;
@@ -35,10 +70,13 @@ struct drm_atomic_context {
     int fd;
 
     struct drm_object *crtc;
-    struct drm_object *primary_plane;
-    struct drm_object *overlay_plane;
+    struct drm_object *connector;
+    struct drm_object *osd_plane;
+    struct drm_object *video_plane;
 
     drmModeAtomicReq *request;
+
+    struct drm_atomic_state old_state;
 };
 
 
@@ -46,10 +84,18 @@ int drm_object_create_properties(struct mp_log *log, int fd, struct drm_object *
 void drm_object_free_properties(struct drm_object *object);
 int drm_object_get_property(struct drm_object *object, char *name, uint64_t *value);
 int drm_object_set_property(drmModeAtomicReq *request, struct drm_object *object, char *name, uint64_t value);
+drmModePropertyBlobPtr drm_object_get_property_blob(struct drm_object *object, char *name);
 struct drm_object * drm_object_create(struct mp_log *log, int fd, uint32_t object_id, uint32_t type);
 void drm_object_free(struct drm_object *object);
 void drm_object_print_info(struct mp_log *log, struct drm_object *object);
-struct drm_atomic_context *drm_atomic_create_context(struct mp_log *log, int fd, int crtc_id, int overlay_id);
+struct drm_atomic_context *drm_atomic_create_context(struct mp_log *log, int fd, int crtc_id, int connector_id,
+													 int osd_plane_id, int video_plane_id);
 void drm_atomic_destroy_context(struct drm_atomic_context *ctx);
 
+bool drm_atomic_save_old_state(struct drm_atomic_context *ctx);
+bool drm_atomic_restore_old_state(drmModeAtomicReq *request, struct drm_atomic_context *ctx);
+
+bool drm_mode_ensure_blob(int fd, struct drm_mode *mode);
+bool drm_mode_destroy_blob(int fd, struct drm_mode *mode);
+
 #endif // MP_DRMATOMIC_H
diff --git a/video/out/drm_common.c b/video/out/drm_common.c
index 8402ac7..aa3d099 100644
--- a/video/out/drm_common.c
+++ b/video/out/drm_common.c
@@ -47,9 +47,18 @@ const struct m_sub_options drm_conf = {
         OPT_STRING_VALIDATE("drm-connector", drm_connector_spec,
                             0, drm_validate_connector_opt),
         OPT_INT("drm-mode", drm_mode_id, 0),
-        OPT_INT("drm-overlay", drm_overlay_id, 0),
+        OPT_INT("drm-osd-plane-id", drm_osd_plane_id, 0),
+        OPT_INT("drm-video-plane-id", drm_video_plane_id, 0),
+        OPT_CHOICE("drm-format", drm_format, 0,
+                   ({"xrgb8888",    DRM_OPTS_FORMAT_XRGB8888},
+                    {"xrgb2101010", DRM_OPTS_FORMAT_XRGB2101010})),
+        OPT_SIZE_BOX("drm-osd-size", drm_osd_size, 0),
         {0},
     },
+    .defaults = &(const struct drm_opts) {
+        .drm_osd_plane_id = -1,
+        .drm_video_plane_id = -1,
+    },
     .size = sizeof(struct drm_opts),
 };
 
@@ -164,6 +173,27 @@ static bool setup_connector(struct kms *kms, const drmModeRes *res,
 
 static bool setup_crtc(struct kms *kms, const drmModeRes *res)
 {
+    // First try to find currently connected encoder and its current CRTC
+    for (unsigned int i = 0; i < res->count_encoders; i++) {
+        drmModeEncoder *encoder = drmModeGetEncoder(kms->fd, res->encoders[i]);
+        if (!encoder) {
+            MP_WARN(kms, "Cannot retrieve encoder %u:%u: %s\n",
+                    i, res->encoders[i], mp_strerror(errno));
+            continue;
+        }
+
+        if (encoder->encoder_id == kms->connector->encoder_id && encoder->crtc_id != 0) {
+            MP_VERBOSE(kms, "Connector %u currently connected to encoder %u\n",
+                       kms->connector->connector_id, kms->connector->encoder_id);
+            kms->encoder = encoder;
+            kms->crtc_id = encoder->crtc_id;
+            goto success;
+        }
+
+        drmModeFreeEncoder(encoder);
+    }
+
+    // Otherwise pick first legal encoder and CRTC combo for the connector
     for (unsigned int i = 0; i < kms->connector->count_encoders; ++i) {
         drmModeEncoder *encoder
             = drmModeGetEncoder(kms->fd, kms->connector->encoders[i]);
@@ -181,7 +211,7 @@ static bool setup_crtc(struct kms *kms, const drmModeRes *res)
 
             kms->encoder = encoder;
             kms->crtc_id = res->crtcs[j];
-            return true;
+            goto success;
         }
 
         drmModeFreeEncoder(encoder);
@@ -190,6 +220,11 @@ static bool setup_crtc(struct kms *kms, const drmModeRes *res)
     MP_ERR(kms, "Connector %u has no suitable CRTC\n",
            kms->connector->connector_id);
     return false;
+
+  success:
+    MP_VERBOSE(kms, "Selected Encoder %u with CRTC %u\n",
+               kms->encoder->encoder_id, kms->crtc_id);
+    return true;
 }
 
 static bool setup_mode(struct kms *kms, int mode_id)
@@ -202,7 +237,7 @@ static bool setup_mode(struct kms *kms, int mode_id)
         return false;
     }
 
-    kms->mode = kms->connector->modes[mode_id];
+    kms->mode.mode = kms->connector->modes[mode_id];
     return true;
 }
 
@@ -234,7 +269,7 @@ static void parse_connector_spec(struct mp_log *log,
 
 
 struct kms *kms_create(struct mp_log *log, const char *connector_spec,
-                       int mode_id, int overlay_id)
+                       int mode_id, int osd_plane_id, int video_plane_id)
 {
     int card_no = -1;
     char *connector_name = NULL;
@@ -246,7 +281,7 @@ struct kms *kms_create(struct mp_log *log, const char *connector_spec,
         .fd = open_card(card_no),
         .connector = NULL,
         .encoder = NULL,
-        .mode = { 0 },
+        .mode = {{0}},
         .crtc_id = -1,
         .card_no = card_no,
     };
@@ -281,14 +316,14 @@ struct kms *kms_create(struct mp_log *log, const char *connector_spec,
         mp_verbose(log, "No DRM Atomic support found\n");
     } else {
         mp_verbose(log, "DRM Atomic support found\n");
-        kms->atomic_context = drm_atomic_create_context(kms->log, kms->fd, kms->crtc_id, overlay_id);
+        kms->atomic_context = drm_atomic_create_context(kms->log, kms->fd, kms->crtc_id,
+                                                        kms->connector->connector_id, osd_plane_id, video_plane_id);
         if (!kms->atomic_context) {
             mp_err(log, "Failed to create DRM atomic context\n");
             goto err;
         }
     }
 
-
     drmModeFreeResources(res);
     return kms;
 
@@ -305,6 +340,7 @@ void kms_destroy(struct kms *kms)
 {
     if (!kms)
         return;
+    drm_mode_destroy_blob(kms->fd, &kms->mode);
     if (kms->connector) {
         drmModeFreeConnector(kms->connector);
         kms->connector = NULL;
@@ -389,7 +425,7 @@ void kms_show_available_cards_and_connectors(struct mp_log *log)
 
 double kms_get_display_fps(const struct kms *kms)
 {
-    return mode_get_Hz(&kms->mode);
+    return mode_get_Hz(&kms->mode.mode);
 }
 
 int drm_validate_connector_opt(struct mp_log *log, const struct m_option *opt,
@@ -428,7 +464,6 @@ static int install_signal(int signo, void (*handler)(int))
     return sigaction(signo, &act, NULL);
 }
 
-
 bool vt_switcher_init(struct vt_switcher *s, struct mp_log *log)
 {
     s->log = log;
@@ -479,6 +514,14 @@ bool vt_switcher_init(struct vt_switcher *s, struct mp_log *log)
         return false;
     }
 
+    // Block the VT switching signals from interrupting the VO thread (they will
+    // still be picked up by other threads, which will fill vt_switcher_pipe for us)
+    sigset_t set;
+    sigemptyset(&set);
+    sigaddset(&set, RELEASE_SIGNAL);
+    sigaddset(&set, ACQUIRE_SIGNAL);
+    pthread_sigmask(SIG_BLOCK, &set, NULL);
+
     return true;
 }
 
@@ -504,6 +547,13 @@ void vt_switcher_interrupt_poll(struct vt_switcher *s)
 
 void vt_switcher_destroy(struct vt_switcher *s)
 {
+    struct vt_mode vt_mode = {0};
+    vt_mode.mode = VT_AUTO;
+    if (ioctl(s->tty_fd, VT_SETMODE, &vt_mode) < 0) {
+        MP_ERR(s, "VT_SETMODE failed: %s\n", mp_strerror(errno));
+        return;
+    }
+
     install_signal(RELEASE_SIGNAL, SIG_DFL);
     install_signal(ACQUIRE_SIGNAL, SIG_DFL);
     close(s->tty_fd);
diff --git a/video/out/drm_common.h b/video/out/drm_common.h
index ff913ff..3f14410 100644
--- a/video/out/drm_common.h
+++ b/video/out/drm_common.h
@@ -24,12 +24,15 @@
 #include "options/m_option.h"
 #include "drm_atomic.h"
 
+#define DRM_OPTS_FORMAT_XRGB8888    0
+#define DRM_OPTS_FORMAT_XRGB2101010 1
+
 struct kms {
     struct mp_log *log;
     int fd;
     drmModeConnector *connector;
     drmModeEncoder *encoder;
-    drmModeModeInfo mode;
+    struct drm_mode mode;
     uint32_t crtc_id;
     int card_no;
     struct drm_atomic_context *atomic_context;
@@ -45,7 +48,10 @@ struct vt_switcher {
 struct drm_opts {
     char *drm_connector_spec;
     int drm_mode_id;
-    int drm_overlay_id;
+    int drm_osd_plane_id;
+    int drm_video_plane_id;
+    int drm_format;
+    struct m_geometry drm_osd_size;
 };
 
 bool vt_switcher_init(struct vt_switcher *s, struct mp_log *log);
@@ -59,7 +65,7 @@ void vt_switcher_release(struct vt_switcher *s, void (*handler)(void*),
                          void *user_data);
 
 struct kms *kms_create(struct mp_log *log, const char *connector_spec,
-                       int mode_id, int overlay_id);
+                       int mode_id, int osd_plane_id, int video_plane_id);
 void kms_destroy(struct kms *kms);
 double kms_get_display_fps(const struct kms *kms);
 
diff --git a/video/out/gpu/context.h b/video/out/gpu/context.h
index 78c0441..a2fcb37 100644
--- a/video/out/gpu/context.h
+++ b/video/out/gpu/context.h
@@ -69,9 +69,6 @@ struct ra_swapchain_fns {
     // Gets the current framebuffer depth in bits (0 if unknown). Optional.
     int (*color_depth)(struct ra_swapchain *sw);
 
-    // Retrieves a screenshot of the framebuffer. Optional.
-    struct mp_image *(*screenshot)(struct ra_swapchain *sw);
-
     // Called when rendering starts. Returns NULL on failure. This must be
     // followed by submit_frame, to submit the rendered frame. This function
     // can also fail sporadically, and such errors should be ignored unless
diff --git a/video/out/gpu/d3d11_helpers.c b/video/out/gpu/d3d11_helpers.c
index b96b03a..d267ac3 100644
--- a/video/out/gpu/d3d11_helpers.c
+++ b/video/out/gpu/d3d11_helpers.c
@@ -315,29 +315,18 @@ bool mp_d3d11_create_swapchain(ID3D11Device *dev, struct mp_log *log,
     if (FAILED(hr))
         factory2 = NULL;
 
-    // Try B8G8R8A8_UNORM first, since at least in Windows 8, it's always the
-    // format of the desktop image
-    static const DXGI_FORMAT formats[] = {
-        DXGI_FORMAT_B8G8R8A8_UNORM,
-        DXGI_FORMAT_R8G8B8A8_UNORM,
-    };
-    static const int formats_len = MP_ARRAY_SIZE(formats);
     bool flip = factory2 && opts->flip;
 
     // Return here to retry creating the swapchain
     do {
-        for (int i = 0; i < formats_len; i++) {
-            if (factory2) {
-                // Create a DXGI 1.2+ (Windows 8+) swap chain if possible
-                hr = create_swapchain_1_2(dev, factory2, log, opts, flip,
-                                          formats[i], &swapchain);
-            } else {
-                // Fall back to DXGI 1.1 (Windows 7)
-                hr = create_swapchain_1_1(dev, factory, log, opts, formats[i],
-                                          &swapchain);
-            }
-            if (SUCCEEDED(hr))
-                break;
+        if (factory2) {
+            // Create a DXGI 1.2+ (Windows 8+) swap chain if possible
+            hr = create_swapchain_1_2(dev, factory2, log, opts, flip,
+                                      DXGI_FORMAT_R8G8B8A8_UNORM, &swapchain);
+        } else {
+            // Fall back to DXGI 1.1 (Windows 7)
+            hr = create_swapchain_1_1(dev, factory, log, opts,
+                                      DXGI_FORMAT_R8G8B8A8_UNORM, &swapchain);
         }
         if (SUCCEEDED(hr))
             break;
@@ -385,84 +374,3 @@ done:
     SAFE_RELEASE(dxgi_dev);
     return success;
 }
-
-struct mp_image *mp_d3d11_screenshot(IDXGISwapChain *swapchain)
-{
-    ID3D11Device *dev = NULL;
-    ID3D11DeviceContext *ctx = NULL;
-    ID3D11Texture2D *frontbuffer = NULL;
-    ID3D11Texture2D *staging = NULL;
-    struct mp_image *img = NULL;
-    HRESULT hr;
-
-    // Validate the swap chain. This screenshot method will only work on DXGI
-    // 1.2+ flip/sequential swap chains. It's probably not possible at all with
-    // discard swap chains, since by definition, the backbuffer contents is
-    // discarded on Present().
-    DXGI_SWAP_CHAIN_DESC scd;
-    hr = IDXGISwapChain_GetDesc(swapchain, &scd);
-    if (FAILED(hr))
-        goto done;
-    if (scd.SwapEffect != DXGI_SWAP_EFFECT_FLIP_SEQUENTIAL)
-        goto done;
-
-    // Get the last buffer that was presented with Present(). This should be
-    // the n-1th buffer for a swap chain of length n.
-    hr = IDXGISwapChain_GetBuffer(swapchain, scd.BufferCount - 1,
-        &IID_ID3D11Texture2D, (void**)&frontbuffer);
-    if (FAILED(hr))
-        goto done;
-
-    ID3D11Texture2D_GetDevice(frontbuffer, &dev);
-    ID3D11Device_GetImmediateContext(dev, &ctx);
-
-    D3D11_TEXTURE2D_DESC td;
-    ID3D11Texture2D_GetDesc(frontbuffer, &td);
-    if (td.SampleDesc.Count > 1)
-        goto done;
-
-    // Validate the backbuffer format and convert to an mpv IMGFMT
-    enum mp_imgfmt fmt;
-    switch (td.Format) {
-    case DXGI_FORMAT_B8G8R8A8_UNORM: fmt = IMGFMT_BGR0; break;
-    case DXGI_FORMAT_R8G8B8A8_UNORM: fmt = IMGFMT_RGB0; break;
-    default:
-        goto done;
-    }
-
-    // Create a staging texture based on the frontbuffer with CPU access
-    td.BindFlags = 0;
-    td.MiscFlags = 0;
-    td.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
-    td.Usage = D3D11_USAGE_STAGING;
-    hr = ID3D11Device_CreateTexture2D(dev, &td, 0, &staging);
-    if (FAILED(hr))
-        goto done;
-
-    ID3D11DeviceContext_CopyResource(ctx, (ID3D11Resource*)staging,
-        (ID3D11Resource*)frontbuffer);
-
-    // Attempt to map the staging texture to CPU-accessible memory
-    D3D11_MAPPED_SUBRESOURCE lock;
-    hr = ID3D11DeviceContext_Map(ctx, (ID3D11Resource*)staging, 0,
-                                 D3D11_MAP_READ, 0, &lock);
-    if (FAILED(hr))
-        goto done;
-
-    img = mp_image_alloc(fmt, td.Width, td.Height);
-    if (!img)
-        return NULL;
-    for (int i = 0; i < td.Height; i++) {
-        memcpy(img->planes[0] + img->stride[0] * i,
-               (char*)lock.pData + lock.RowPitch * i, td.Width * 4);
-    }
-
-    ID3D11DeviceContext_Unmap(ctx, (ID3D11Resource*)staging, 0);
-
-done:
-    SAFE_RELEASE(frontbuffer);
-    SAFE_RELEASE(staging);
-    SAFE_RELEASE(ctx);
-    SAFE_RELEASE(dev);
-    return img;
-}
diff --git a/video/out/gpu/d3d11_helpers.h b/video/out/gpu/d3d11_helpers.h
index 481c183..996b934 100644
--- a/video/out/gpu/d3d11_helpers.h
+++ b/video/out/gpu/d3d11_helpers.h
@@ -78,6 +78,4 @@ bool mp_d3d11_create_swapchain(ID3D11Device *dev, struct mp_log *log,
                                struct d3d11_swapchain_opts *opts,
                                IDXGISwapChain **swapchain_out);
 
-struct mp_image *mp_d3d11_screenshot(IDXGISwapChain *swapchain);
-
 #endif
diff --git a/video/out/gpu/hwdec.c b/video/out/gpu/hwdec.c
index 5284116..fc37074 100644
--- a/video/out/gpu/hwdec.c
+++ b/video/out/gpu/hwdec.c
@@ -35,6 +35,7 @@ extern const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb;
 extern const struct ra_hwdec_driver ra_hwdec_dxva2gldx;
 extern const struct ra_hwdec_driver ra_hwdec_dxva2;
 extern const struct ra_hwdec_driver ra_hwdec_d3d11va;
+extern const struct ra_hwdec_driver ra_hwdec_dxva2dxgi;
 extern const struct ra_hwdec_driver ra_hwdec_cuda;
 extern const struct ra_hwdec_driver ra_hwdec_cuda_nvdec;
 extern const struct ra_hwdec_driver ra_hwdec_rpi_overlay;
@@ -48,13 +49,18 @@ const struct ra_hwdec_driver *const ra_hwdec_drivers[] = {
     &ra_hwdec_videotoolbox,
 #endif
 #if HAVE_D3D_HWACCEL
+ #if HAVE_EGL_ANGLE
     &ra_hwdec_d3d11egl,
     &ra_hwdec_d3d11eglrgb,
- #if HAVE_D3D9_HWACCEL
+  #if HAVE_D3D9_HWACCEL
     &ra_hwdec_dxva2egl,
+  #endif
  #endif
  #if HAVE_D3D11
     &ra_hwdec_d3d11va,
+  #if HAVE_D3D9_HWACCEL
+    &ra_hwdec_dxva2dxgi,
+  #endif
  #endif
 #endif
 #if HAVE_GL_DXINTEROP_D3D9
diff --git a/video/out/gpu/lcms.c b/video/out/gpu/lcms.c
index 3552351..bc76db9 100644
--- a/video/out/gpu/lcms.c
+++ b/video/out/gpu/lcms.c
@@ -83,7 +83,7 @@ const struct m_sub_options mp_icc_conf = {
         OPT_FLAG("icc-profile-auto", profile_auto, 0),
         OPT_STRING("icc-cache-dir", cache_dir, M_OPT_FILE),
         OPT_INT("icc-intent", intent, 0),
-        OPT_INTRANGE("icc-contrast", contrast, 0, 0, 100000),
+        OPT_INTRANGE("icc-contrast", contrast, 0, 0, 1000000),
         OPT_STRING_VALIDATE("icc-3dlut-size", size_str, 0, validate_3dlut_size_opt),
 
         OPT_REPLACED("3dlut-size", "icc-3dlut-size"),
@@ -304,7 +304,8 @@ static cmsHPROFILE get_vid_profile(struct gl_lcms *p, cmsContext cms,
 
         // Built-in contrast failsafe
         double contrast = 3.0 / (src_black[0] + src_black[1] + src_black[2]);
-        if (contrast > 100000) {
+        MP_VERBOSE(p, "Detected ICC profile contrast: %f\n", contrast);
+        if (contrast > 100000 && !p->opts->contrast) {
             MP_WARN(p, "ICC profile detected contrast very high (>100000),"
                     " falling back to contrast 1000 for sanity. Set the"
                     " icc-contrast option to silence this warning.\n");
diff --git a/video/out/gpu/libmpv_gpu.c b/video/out/gpu/libmpv_gpu.c
new file mode 100644
index 0000000..fce2acf
--- /dev/null
+++ b/video/out/gpu/libmpv_gpu.c
@@ -0,0 +1,285 @@
+#include "config.h"
+#include "hwdec.h"
+#include "libmpv_gpu.h"
+#include "libmpv/render_gl.h"
+#include "video.h"
+#include "video/out/libmpv.h"
+
+// Context backends compiled into this build, terminated by NULL. init()
+// picks the entry whose api_name equals the MPV_RENDER_PARAM_API_TYPE string.
+static const struct libmpv_gpu_context_fns *context_backends[] = {
+#if HAVE_GL
+    &libmpv_gpu_context_gl,
+#endif
+    NULL
+};
+
+// Backend-private state, stored in render_backend.priv by init().
+struct priv {
+    struct libmpv_gpu_context *context; // stays NULL if no backend matched
+
+    struct gl_video *renderer;          // created by init() via gl_video_init()
+};
+
+struct native_resource_entry {
+    const char *name;   // ra_add_native_resource() internal name argument
+    size_t size;        // size of struct pointed to (0 for no copy)
+};
+
+// Indexed by mpv_render_param type. Indices not listed are zero-initialized
+// (name == NULL); init() skips those and anything outside the array.
+static const struct native_resource_entry native_resource_map[] = {
+    [MPV_RENDER_PARAM_X11_DISPLAY] = {
+        .name = "x11",
+        .size = 0,
+    },
+    [MPV_RENDER_PARAM_WL_DISPLAY] = {
+        .name = "wl",
+        .size = 0,
+    },
+    [MPV_RENDER_PARAM_DRM_DISPLAY] = {
+        .name = "drm_params",
+        .size = sizeof (mpv_opengl_drm_params),
+    },
+    [MPV_RENDER_PARAM_DRM_OSD_SIZE] = {
+        .name = "drm_osd_size",
+        .size = sizeof (mpv_opengl_drm_osd_size),
+    },
+};
+
+// Select the context backend matching MPV_RENDER_PARAM_API_TYPE, initialize
+// it, hand the native resources found in params to the ra, and create the
+// renderer. Returns 0 on success, otherwise a libmpv error code:
+//  - MPV_ERROR_INVALID_PARAMETER: no API type was passed at all
+//  - MPV_ERROR_NOT_IMPLEMENTED: no compiled-in backend handles the API type
+//  - any negative code returned by the context backend's own init()
+// NOTE(review): the error paths return with ctx->priv (and possibly
+// p->context) still allocated; presumably the caller runs destroy() after a
+// failed init - confirm.
+static int init(struct render_backend *ctx, mpv_render_param *params)
+{
+    ctx->priv = talloc_zero(NULL, struct priv);
+    struct priv *p = ctx->priv;
+
+    char *api = get_mpv_render_param(params, MPV_RENDER_PARAM_API_TYPE, NULL);
+    if (!api)
+        return MPV_ERROR_INVALID_PARAMETER;
+
+    for (int n = 0; context_backends[n]; n++) {
+        const struct libmpv_gpu_context_fns *backend = context_backends[n];
+        if (strcmp(backend->api_name, api) == 0) {
+            p->context = talloc_zero(NULL, struct libmpv_gpu_context);
+            *p->context = (struct libmpv_gpu_context){
+                .global = ctx->global,
+                .log = ctx->log,
+                .fns = backend,
+            };
+            break;
+        }
+    }
+
+    // A well-formed API type that no backend in this build handles is an
+    // unsupported feature, not a malformed parameter. Keep it distinguishable
+    // from the missing-parameter case above.
+    if (!p->context)
+        return MPV_ERROR_NOT_IMPLEMENTED;
+
+    int err = p->context->fns->init(p->context, params);
+    if (err < 0)
+        return err;
+
+    // Forward recognized native resources to the ra. Entries with a non-zero
+    // size are duplicated (with p as talloc context) so the ra does not depend
+    // on the caller's memory; size 0 entries pass the caller's pointer as-is.
+    for (int n = 0; params && params[n].type; n++) {
+        if (params[n].type > 0 &&
+            params[n].type < MP_ARRAY_SIZE(native_resource_map) &&
+            native_resource_map[params[n].type].name)
+        {
+            const struct native_resource_entry *entry =
+                &native_resource_map[params[n].type];
+            void *data = params[n].data;
+            if (entry->size)
+                data = talloc_memdup(p, data, entry->size);
+            ra_add_native_resource(p->context->ra, entry->name, data);
+        }
+    }
+
+    p->renderer = gl_video_init(p->context->ra, ctx->log, ctx->global);
+
+    ctx->hwdec_devs = hwdec_devices_create();
+    gl_video_load_hwdecs(p->renderer, ctx->hwdec_devs, true);
+    ctx->driver_caps = VO_CAP_ROTATE90;
+    return 0;
+}
+
+// Forward the format query to the renderer (gl_video_check_format()).
+static bool check_format(struct render_backend *ctx, int imgfmt)
+{
+    struct priv *p = ctx->priv;
+
+    return gl_video_check_format(p->renderer, imgfmt);
+}
+
+// Apply a parameter that may change at runtime. Handles
+// MPV_RENDER_PARAM_ICC_PROFILE and MPV_RENDER_PARAM_AMBIENT_LIGHT; every
+// other type yields MPV_ERROR_NOT_IMPLEMENTED. param.data is dereferenced
+// without a NULL check.
+static int set_parameter(struct render_backend *ctx, mpv_render_param param)
+{
+    struct priv *p = ctx->priv;
+
+    switch (param.type) {
+    case MPV_RENDER_PARAM_ICC_PROFILE: {
+        mpv_byte_array *data = param.data;
+        gl_video_set_icc_profile(p->renderer, (bstr){data->data, data->size});
+        return 0;
+    }
+    case MPV_RENDER_PARAM_AMBIENT_LIGHT: {
+        int lux = *(int *)param.data;
+        gl_video_set_ambient_lux(p->renderer, lux);
+        return 0;
+    }
+    default:
+        return MPV_ERROR_NOT_IMPLEMENTED;
+    }
+}
+
+// Pass new video image parameters on to the renderer.
+static void reconfig(struct render_backend *ctx, struct mp_image_params *params)
+{
+    struct priv *p = ctx->priv;
+
+    gl_video_config(p->renderer, params);
+}
+
+// Forward a reset request to the renderer (gl_video_reset()).
+static void reset(struct render_backend *ctx)
+{
+    struct priv *p = ctx->priv;
+
+    gl_video_reset(p->renderer);
+}
+
+// Attach the renderer to a VO, or detach it when vo is NULL: the OSD source
+// is always updated (cleared for NULL), the queue is configured only for a
+// non-NULL vo.
+static void update_external(struct render_backend *ctx, struct vo *vo)
+{
+    struct priv *p = ctx->priv;
+
+    gl_video_set_osd_source(p->renderer, vo ? vo->osd : NULL);
+    if (vo)
+        gl_video_configure_queue(p->renderer, vo);
+}
+
+// Forward the new source/destination rectangles and OSD dimensions to the
+// renderer.
+static void resize(struct render_backend *ctx, struct mp_rect *src,
+                   struct mp_rect *dst, struct mp_osd_res *osd)
+{
+    struct priv *p = ctx->priv;
+
+    gl_video_resize(p->renderer, src, dst, osd);
+}
+
+// Determine the size of the render target described by params. On success
+// returns 0 and stores the dimensions in *out_w and *out_h; on failure
+// returns the error from wrap_fbo() and leaves the outputs untouched.
+static int get_target_size(struct render_backend *ctx, mpv_render_param *params,
+                           int *out_w, int *out_h)
+{
+    struct priv *p = ctx->priv;
+
+    // Mapping the surface is cheap, better than adding new backend entrypoints.
+    struct ra_tex *tex;
+    int err = p->context->fns->wrap_fbo(p->context, params, &tex);
+    if (err < 0)
+        return err;
+    *out_w = tex->params.w;
+    *out_h = tex->params.h;
+    return 0;
+}
+
+// Render one frame into the target described by params. Returns 0 on
+// success or the error from wrap_fbo(). MPV_RENDER_PARAM_DEPTH and
+// MPV_RENDER_PARAM_FLIP_Y are optional; the compound literals below
+// presumably act as the value used when a parameter is absent.
+static int render(struct render_backend *ctx, mpv_render_param *params,
+                  struct vo_frame *frame)
+{
+    struct priv *p = ctx->priv;
+
+    // Mapping the surface is cheap, better than adding new backend entrypoints.
+    struct ra_tex *tex;
+    int err = p->context->fns->wrap_fbo(p->context, params, &tex);
+    if (err < 0)
+        return err;
+
+    int depth = *(int *)get_mpv_render_param(params, MPV_RENDER_PARAM_DEPTH,
+                                             &(int){0});
+    gl_video_set_fb_depth(p->renderer, depth);
+
+    bool flip = *(int *)get_mpv_render_param(params, MPV_RENDER_PARAM_FLIP_Y,
+                                             &(int){0});
+
+    struct ra_fbo target = {.tex = tex, .flip = flip};
+    gl_video_render_frame(p->renderer, frame, target, RENDER_FRAME_DEF);
+    p->context->fns->done_frame(p->context, frame->display_synced);
+
+    return 0;
+}
+
+// Forward an image allocation request to the renderer and return its result.
+static struct mp_image *get_image(struct render_backend *ctx, int imgfmt,
+                                  int w, int h, int stride_align)
+{
+    struct priv *p = ctx->priv;
+
+    return gl_video_get_image(p->renderer, imgfmt, w, h, stride_align);
+}
+
+// Forward a screenshot request for frame to the renderer.
+static void screenshot(struct render_backend *ctx, struct vo_frame *frame,
+                       struct voctrl_screenshot *args)
+{
+    struct priv *p = ctx->priv;
+
+    gl_video_screenshot(p->renderer, frame, args);
+}
+
+// Tear down in this order: renderer, hwdec device list, context backend.
+// The renderer and the context are skipped when NULL (e.g. init() bailed out
+// early). NOTE(review): ctx->priv itself is not freed here; presumably the
+// caller owns that - confirm.
+static void destroy(struct render_backend *ctx)
+{
+    struct priv *p = ctx->priv;
+
+    if (p->renderer)
+        gl_video_uninit(p->renderer);
+
+    hwdec_devices_destroy(ctx->hwdec_devs);
+
+    if (p->context) {
+        p->context->fns->destroy(p->context);
+        talloc_free(p->context->priv);
+        talloc_free(p->context);
+    }
+}
+
+// Function table exported to the generic libmpv render code.
+const struct render_backend_fns render_backend_gpu = {
+    .init = init,
+    .check_format = check_format,
+    .set_parameter = set_parameter,
+    .reconfig = reconfig,
+    .reset = reset,
+    .update_external = update_external,
+    .resize = resize,
+    .get_target_size = get_target_size,
+    .render = render,
+    .get_image = get_image,
+    .screenshot = screenshot,
+    .destroy = destroy,
+};
diff --git a/video/out/gpu/libmpv_gpu.h b/video/out/gpu/libmpv_gpu.h
new file mode 100644
index 0000000..2c9f712
--- /dev/null
+++ b/video/out/gpu/libmpv_gpu.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include "video/out/libmpv.h"
+
+struct ra_tex;
+
+struct libmpv_gpu_context {
+    struct mpv_global *global;
+    struct mp_log *log;
+    const struct libmpv_gpu_context_fns *fns;   // backend function table
+
+    struct ra *ra;      // must be set by a successful fns->init()
+    void *priv;         // backend-private data; fns->destroy() frees its contents
+};
+
+// Manage backend specific interaction between libmpv and ra backend, that can't
+// be managed by ra itself (initialization and passing FBOs).
+struct libmpv_gpu_context_fns {
+    // The libmpv API type name, see MPV_RENDER_PARAM_API_TYPE.
+    const char *api_name;
+    // Pretty much works like render_backend_fns.init, except that the
+    // API type is already checked by the caller.
+    // Successful init must set ctx->ra.
+    int (*init)(struct libmpv_gpu_context *ctx, mpv_render_param *params);
+    // Wrap the surface passed to mpv_render_context_render() (via the params
+    // array) into a ra_tex and return it. Returns a libmpv error code, and sets
+    // *out to a temporary object on success. The returned object is valid until
+    // another wrap_fbo() or done_frame() is called.
+    // This does not need to care about generic attributes, like flipping.
+    int (*wrap_fbo)(struct libmpv_gpu_context *ctx, mpv_render_param *params,
+                    struct ra_tex **out);
+    // Signal that the ra_tex object obtained with wrap_fbo is no longer used.
+    // For certain backends, this might also be used to signal the end of
+    // rendering (like OpenGL doing weird crap).
+    void (*done_frame)(struct libmpv_gpu_context *ctx, bool ds);
+    // Free all data in ctx->priv.
+    void (*destroy)(struct libmpv_gpu_context *ctx);
+};
+
+extern const struct libmpv_gpu_context_fns libmpv_gpu_context_gl;
diff --git a/video/out/gpu/osd.c b/video/out/gpu/osd.c
index 317deb6..75f69f0 100644
--- a/video/out/gpu/osd.c
+++ b/video/out/gpu/osd.c
@@ -314,7 +314,7 @@ void mpgl_osd_draw_finish(struct mpgl_osd *ctx, int index,
     const int *factors = &blend_factors[part->format][0];
     gl_sc_blend(sc, factors[0], factors[1], factors[2], factors[3]);
 
-    gl_sc_dispatch_draw(sc, fbo.tex, vertex_vao, MP_ARRAY_SIZE(vertex_vao),
+    gl_sc_dispatch_draw(sc, fbo.tex, false, vertex_vao, MP_ARRAY_SIZE(vertex_vao),
                         sizeof(struct vertex), part->vertices, part->num_vertices);
 }
 
diff --git a/video/out/gpu/ra.c b/video/out/gpu/ra.c
index fdb20fe..0c15651 100644
--- a/video/out/gpu/ra.c
+++ b/video/out/gpu/ra.c
@@ -4,6 +4,26 @@
 
 #include "ra.h"
 
+void ra_add_native_resource(struct ra *ra, const char *name, void *data)
+{
+    struct ra_native_resource r = {
+        .name = name,   // pointer is stored, the string is not copied
+        .data = data,   // pointer is stored, the pointee is not copied
+    };
+    MP_TARRAY_APPEND(ra, ra->native_resources, ra->num_native_resources, r);
+}
+
+void *ra_get_native_resource(struct ra *ra, const char *name)
+{
+    for (int n = 0; n < ra->num_native_resources; n++) {
+        struct ra_native_resource *r = &ra->native_resources[n];
+        if (strcmp(r->name, name) == 0)
+            return r->data; // first entry with a matching name wins
+    }
+
+    return NULL; // no entry registered under this name
+}
+
 struct ra_tex *ra_tex_create(struct ra *ra, const struct ra_tex_params *params)
 {
     return ra->fns->tex_create(ra, params);
diff --git a/video/out/gpu/ra.h b/video/out/gpu/ra.h
index 934e5db..79caacc 100644
--- a/video/out/gpu/ra.h
+++ b/video/out/gpu/ra.h
@@ -39,8 +39,29 @@ struct ra {
     // RA_CAP_DIRECT_UPLOAD is supported. This is basically only relevant for
     // OpenGL. Set by the RA user.
     bool use_pbo;
+
+    // Array of native resources. For the most part an "escape" mechanism, and
+    // usually does not contain parameters required for basic functionality.
+    struct ra_native_resource *native_resources;
+    int num_native_resources;
+};
+
+// For passing through windowing system specific parameters and such. The
+// names are always internal (except for legacy opengl-cb uses; the libmpv
+// render API uses mpv_render_param_type and maps them to names internally).
+// For example, a name="x11" entry has a X11 display as (Display*)data.
+struct ra_native_resource {
+    const char *name;   // internal lookup key, e.g. "x11"
+    void *data;         // the resource itself, e.g. a (Display*) for "x11"
 };
 
+// Add a ra_native_resource entry. Both name and data pointers must stay valid
+// until ra termination.
+void ra_add_native_resource(struct ra *ra, const char *name, void *data);
+
+// Search ra->native_resources, returns NULL on failure.
+void *ra_get_native_resource(struct ra *ra, const char *name);
+
 enum {
     RA_CAP_TEX_1D         = 1 << 0, // supports 1D textures (as shader inputs)
     RA_CAP_TEX_3D         = 1 << 1, // supports 3D textures (as shader inputs)
@@ -53,6 +74,8 @@ enum {
     RA_CAP_GLOBAL_UNIFORM = 1 << 8, // supports using "naked" uniforms (not UBO)
     RA_CAP_GATHER         = 1 << 9, // supports textureGather in GLSL
     RA_CAP_FRAGCOORD      = 1 << 10, // supports reading from gl_FragCoord
+    RA_CAP_PARALLEL_COMPUTE  = 1 << 11, // supports parallel compute shaders
+    RA_CAP_NUM_GROUPS     = 1 << 12, // supports gl_NumWorkGroups
 };
 
 enum ra_ctype {
@@ -84,6 +107,8 @@ struct ra_format {
                             // only applies to 2-component textures
     bool linear_filter;     // linear filtering available from shader
     bool renderable;        // can be used for render targets
+    bool dummy_format;      // is not a real ra_format but a fake one (e.g. FBO).
+                            // dummy formats cannot be used to create textures
 
     // If not 0, the format represents some sort of packed fringe format, whose
     // shader representation is given by the special_imgfmt_desc pointer.
@@ -106,6 +131,7 @@ struct ra_tex_params {
     bool blit_src;          // must be usable as a blit source
     bool blit_dst;          // must be usable as a blit destination
     bool host_mutable;      // texture may be updated with tex_upload
+    bool downloadable;      // texture can be read with tex_download
     // When used as render source texture.
     bool src_linear;        // if false, use nearest sampling (whether this can
                             // be true depends on ra_format.linear_filter)
@@ -147,6 +173,13 @@ struct ra_tex_upload_params {
     ptrdiff_t stride;   // The size of a horizontal line in bytes (*not* texels!)
 };
 
+struct ra_tex_download_params {
+    struct ra_tex *tex; // Texture to download from
+    // Downloading directly (set by caller, data written to by callee):
+    void *dst;          // Address of data (packed with no alignment)
+    ptrdiff_t stride;   // The size of a horizontal line in bytes (*not* texels!)
+};
+
 // Buffer usage type. This restricts what types of operations may be performed
 // on a buffer.
 enum ra_buf_type {
@@ -285,6 +318,9 @@ struct ra_renderpass_params {
     enum ra_blend blend_src_alpha;
     enum ra_blend blend_dst_alpha;
 
+    // If true, the contents of `target` not written to will become undefined
+    bool invalidate_target;
+
     // --- type==RA_RENDERPASS_TYPE_COMPUTE only
 
     // Shader text, like vertex_shader/frag_shader.
@@ -372,6 +408,10 @@ struct ra_fns {
     // Returns whether successful.
     bool (*tex_upload)(struct ra *ra, const struct ra_tex_upload_params *params);
 
+    // Copy data from the texture to memory. ra_tex_params.downloadable must
+    // have been set to true on texture creation.
+    bool (*tex_download)(struct ra *ra, struct ra_tex_download_params *params);
+
     // Create a buffer. This can be used as a persistently mapped buffer,
     // a uniform buffer, a shader storage buffer or possibly others.
     // Not all usage types must be supported; may return NULL if unavailable.
diff --git a/video/out/gpu/shader_cache.c b/video/out/gpu/shader_cache.c
index 6d0f370..f38f0a4 100644
--- a/video/out/gpu/shader_cache.c
+++ b/video/out/gpu/shader_cache.c
@@ -777,11 +777,6 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
         ADD(header, "#define texture texture2D\n");
     }
 
-    if (sc->ra->glsl_vulkan && type == RA_RENDERPASS_TYPE_COMPUTE) {
-        ADD(header, "#define gl_GlobalInvocationIndex "
-                    "(gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID)\n");
-    }
-
     // Additional helpers.
     ADD(header, "#define LUT_POS(x, lut_size)"
                 " mix(0.5 / (lut_size), 1.0 - 0.5 / (lut_size), (x))\n");
@@ -965,13 +960,14 @@ static void gl_sc_generate(struct gl_shader_cache *sc,
 }
 
 struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
-                                        struct ra_tex *target,
+                                        struct ra_tex *target, bool discard,
                                         const struct ra_renderpass_input *vao,
                                         int vao_len, size_t vertex_stride,
                                         void *vertices, size_t num_vertices)
 {
     struct timer_pool *timer = NULL;
 
+    sc->params.invalidate_target = discard;
     gl_sc_generate(sc, RA_RENDERPASS_TYPE_RASTER, target->params.format,
                    vao, vao_len, vertex_stride);
     if (!sc->current_shader)
diff --git a/video/out/gpu/shader_cache.h b/video/out/gpu/shader_cache.h
index 2fe7dcf..547c6b6 100644
--- a/video/out/gpu/shader_cache.h
+++ b/video/out/gpu/shader_cache.h
@@ -50,7 +50,7 @@ void gl_sc_blend(struct gl_shader_cache *sc,
                  enum ra_blend blend_dst_alpha);
 void gl_sc_enable_extension(struct gl_shader_cache *sc, char *name);
 struct mp_pass_perf gl_sc_dispatch_draw(struct gl_shader_cache *sc,
-                                        struct ra_tex *target,
+                                        struct ra_tex *target, bool discard,
                                         const struct ra_renderpass_input *vao,
                                         int vao_len, size_t vertex_stride,
                                         void *ptr, size_t num);
diff --git a/video/out/gpu/video.c b/video/out/gpu/video.c
index f80d63a..7594c2b 100644
--- a/video/out/gpu/video.c
+++ b/video/out/gpu/video.c
@@ -313,9 +313,9 @@ static const struct gl_video_opts gl_video_opts_def = {
     .alpha_mode = ALPHA_BLEND_TILES,
     .background = {0, 0, 0, 255},
     .gamma = 1.0f,
-    .tone_mapping = TONE_MAPPING_MOBIUS,
+    .tone_mapping = TONE_MAPPING_HABLE,
     .tone_mapping_param = NAN,
-    .tone_mapping_desat = 1.0,
+    .tone_mapping_desat = 0.5,
     .early_flush = -1,
     .hwdec_interop = "auto",
 };
@@ -351,6 +351,7 @@ const struct m_sub_options gl_video_conf = {
         OPT_FLAG("gamma-auto", gamma_auto, 0),
         OPT_CHOICE_C("target-prim", target_prim, 0, mp_csp_prim_names),
         OPT_CHOICE_C("target-trc", target_trc, 0, mp_csp_trc_names),
+        OPT_INTRANGE("target-peak", target_peak, 0, 10, 10000),
         OPT_CHOICE("tone-mapping", tone_mapping, 0,
                    ({"clip",     TONE_MAPPING_CLIP},
                     {"mobius",   TONE_MAPPING_MOBIUS},
@@ -358,7 +359,10 @@ const struct m_sub_options gl_video_conf = {
                     {"hable",    TONE_MAPPING_HABLE},
                     {"gamma",    TONE_MAPPING_GAMMA},
                     {"linear",   TONE_MAPPING_LINEAR})),
-        OPT_FLAG("hdr-compute-peak", compute_hdr_peak, 0),
+        OPT_CHOICE("hdr-compute-peak", compute_hdr_peak, 0,
+                   ({"auto", 0},
+                    {"yes", 1},
+                    {"no", -1})),
         OPT_FLOAT("tone-mapping-param", tone_mapping_param, 0),
         OPT_FLOAT("tone-mapping-desaturate", tone_mapping_desat, 0),
         OPT_FLAG("gamut-warning", gamut_warning, 0),
@@ -529,9 +533,6 @@ static void uninit_rendering(struct gl_video *p)
     for (int n = 0; n < p->num_hook_textures; n++)
         ra_tex_free(p->ra, &p->hook_textures[n]);
 
-    for (int n = 0; n < 2; n++)
-        ra_tex_free(p->ra, &p->vdpau_deinterleave_tex[n]);
-
     gl_video_reset_surfaces(p);
     gl_video_reset_hooks(p);
 
@@ -548,6 +549,7 @@ struct mp_colorspace gl_video_get_output_colorspace(struct gl_video *p)
     return (struct mp_colorspace) {
         .primaries = p->opts.target_prim,
         .gamma = p->opts.target_trc,
+        .sig_peak = p->opts.target_peak / MP_REF_WHITE,
     };
 }
 
@@ -862,9 +864,6 @@ static void init_video(struct gl_video *p)
     }
     p->color_swizzle[4] = '\0';
 
-    // Format-dependent checks.
-    check_gl_features(p);
-
     mp_image_params_guess_csp(&p->image_params);
 
     av_lfg_init(&p->lfg, 1);
@@ -909,6 +908,9 @@ static void init_video(struct gl_video *p)
 
     debug_check_gl(p, "after video texture creation");
 
+    // Format-dependent checks.
+    check_gl_features(p);
+
     gl_video_setup_hooks(p);
 }
 
@@ -1001,6 +1003,9 @@ static void uninit_video(struct gl_video *p)
     p->hwdec_active = false;
     p->hwdec_overlay = NULL;
     ra_hwdec_mapper_free(&p->hwdec_mapper);
+
+    for (int n = 0; n < 2; n++)
+        ra_tex_free(p->ra, &p->vdpau_deinterleave_tex[n]);
 }
 
 static void pass_record(struct gl_video *p, struct mp_pass_perf perf)
@@ -1148,12 +1153,15 @@ static void dispatch_compute(struct gl_video *p, int w, int h,
     int num_x = info.block_w > 0 ? (w + info.block_w - 1) / info.block_w : 1,
         num_y = info.block_h > 0 ? (h + info.block_h - 1) / info.block_h : 1;
 
+    if (!(p->ra->caps & RA_CAP_NUM_GROUPS))
+        PRELUDE("#define gl_NumWorkGroups uvec3(%d, %d, 1)\n", num_x, num_y);
+
     pass_record(p, gl_sc_dispatch_compute(p->sc, num_x, num_y, 1));
     cleanup_binds(p);
 }
 
 static struct mp_pass_perf render_pass_quad(struct gl_video *p,
-                                            struct ra_fbo fbo,
+                                            struct ra_fbo fbo, bool discard,
                                             const struct mp_rect *dst)
 {
     // The first element is reserved for `vec2 position`
@@ -1211,15 +1219,15 @@ static struct mp_pass_perf render_pass_quad(struct gl_video *p,
             &p->tmp_vertex[num_vertex_attribs * 1],
             vertex_stride);
 
-    return gl_sc_dispatch_draw(p->sc, fbo.tex, p->vao, num_vertex_attribs,
+    return gl_sc_dispatch_draw(p->sc, fbo.tex, discard, p->vao, num_vertex_attribs,
                                vertex_stride, p->tmp_vertex, num_vertices);
 }
 
 static void finish_pass_fbo(struct gl_video *p, struct ra_fbo fbo,
-                            const struct mp_rect *dst)
+                            bool discard, const struct mp_rect *dst)
 {
     pass_prepare_src_tex(p);
-    pass_record(p, render_pass_quad(p, fbo, dst));
+    pass_record(p, render_pass_quad(p, fbo, discard, dst));
     debug_check_gl(p, "after rendering");
     cleanup_binds(p);
 }
@@ -1237,6 +1245,11 @@ static void finish_pass_tex(struct gl_video *p, struct ra_tex **dst_tex,
         return;
     }
 
+    // If RA_CAP_PARALLEL_COMPUTE is set, try to prefer compute shaders
+    // over fragment shaders wherever possible.
+    if (!p->pass_compute.active && (p->ra->caps & RA_CAP_PARALLEL_COMPUTE))
+        pass_is_compute(p, 16, 16);
+
     if (p->pass_compute.active) {
         gl_sc_uniform_image2D_wo(p->sc, "out_image", *dst_tex);
         if (!p->pass_compute.directly_writes)
@@ -1248,7 +1261,7 @@ static void finish_pass_tex(struct gl_video *p, struct ra_tex **dst_tex,
         debug_check_gl(p, "after dispatching compute shader");
     } else {
         struct ra_fbo fbo = { .tex = *dst_tex, };
-        finish_pass_fbo(p, fbo, &(struct mp_rect){0, 0, w, h});
+        finish_pass_fbo(p, fbo, true, &(struct mp_rect){0, 0, w, h});
     }
 }
 
@@ -2384,6 +2397,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
         .gamma = p->opts.target_trc,
         .primaries = p->opts.target_prim,
         .light = MP_CSP_LIGHT_DISPLAY,
+        .sig_peak = p->opts.target_peak / MP_REF_WHITE,
     };
 
     if (p->use_lut_3d) {
@@ -2403,6 +2417,7 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
         if (gl_video_get_lut3d(p, prim_orig, trc_orig)) {
             dst.primaries = prim_orig;
             dst.gamma = trc_orig;
+            assert(dst.primaries && dst.gamma);
         }
     }
 
@@ -2437,20 +2452,23 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
             dst.gamma = MP_CSP_TRC_GAMMA22;
     }
 
-    bool detect_peak = p->opts.compute_hdr_peak && mp_trc_is_hdr(src.gamma);
+    // If there's no specific signal peak known for the output display, infer
+    // it from the chosen transfer function
+    if (!dst.sig_peak)
+        dst.sig_peak = mp_trc_nom_peak(dst.gamma);
+
+    bool detect_peak = p->opts.compute_hdr_peak >= 0 && mp_trc_is_hdr(src.gamma);
     if (detect_peak && !p->hdr_peak_ssbo) {
         struct {
-            unsigned int sig_peak_raw;
-            unsigned int index;
-            unsigned int frame_max[PEAK_DETECT_FRAMES+1];
+            uint32_t counter;
+            uint32_t frame_idx;
+            uint32_t frame_num;
+            uint32_t frame_max[PEAK_DETECT_FRAMES+1];
+            uint32_t frame_sum[PEAK_DETECT_FRAMES+1];
+            uint32_t total_max;
+            uint32_t total_sum;
         } peak_ssbo = {0};
 
-        // Prefill with safe values
-        int safe = MP_REF_WHITE * mp_trc_nom_peak(p->image_params.color.gamma);
-        peak_ssbo.sig_peak_raw = PEAK_DETECT_FRAMES * safe;
-        for (int i = 0; i < PEAK_DETECT_FRAMES+1; i++)
-            peak_ssbo.frame_max[i] = safe;
-
         struct ra_buf_params params = {
             .type = RA_BUF_TYPE_SHADER_STORAGE,
             .size = sizeof(peak_ssbo),
@@ -2460,7 +2478,8 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
         p->hdr_peak_ssbo = ra_buf_create(ra, &params);
         if (!p->hdr_peak_ssbo) {
             MP_WARN(p, "Failed to create HDR peak detection SSBO, disabling.\n");
-            detect_peak = (p->opts.compute_hdr_peak = false);
+            detect_peak = false;
+            p->opts.compute_hdr_peak = -1;
         }
     }
 
@@ -2468,9 +2487,15 @@ static void pass_colormanage(struct gl_video *p, struct mp_colorspace src, bool
         pass_describe(p, "detect HDR peak");
         pass_is_compute(p, 8, 8); // 8x8 is good for performance
         gl_sc_ssbo(p->sc, "PeakDetect", p->hdr_peak_ssbo,
-            "uint sig_peak_raw;"
-            "uint index;"
-            "uint frame_max[%d];", PEAK_DETECT_FRAMES + 1
+            "uint counter;"
+            "uint frame_idx;"
+            "uint frame_num;"
+            "uint frame_max[%d];"
+            "uint frame_avg[%d];"
+            "uint total_max;"
+            "uint total_avg;",
+            PEAK_DETECT_FRAMES + 1,
+            PEAK_DETECT_FRAMES + 1
         );
     }
 
@@ -2603,7 +2628,10 @@ static void pass_dither(struct gl_video *p)
 static void pass_draw_osd(struct gl_video *p, int draw_flags, double pts,
                           struct mp_osd_res rect, struct ra_fbo fbo, bool cms)
 {
-    mpgl_osd_generate(p->osd, rect, pts, p->image_params.stereo_out, draw_flags);
+    if ((draw_flags & OSD_DRAW_SUB_ONLY) && (draw_flags & OSD_DRAW_OSD_ONLY))
+        return;
+
+    mpgl_osd_generate(p->osd, rect, pts, p->image_params.stereo3d, draw_flags);
 
     timer_pool_start(p->osd_timer);
     for (int n = 0; n < MAX_OSD_PARTS; n++) {
@@ -2672,7 +2700,9 @@ static void pass_render_frame_dumb(struct gl_video *p)
 
 // The main rendering function, takes care of everything up to and including
 // upscaling. p->image is rendered.
-static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t id)
+// flags: bit set of RENDER_FRAME_* flags
+static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi,
+                              uint64_t id, int flags)
 {
     // initialize the texture parameters and temporary variables
     p->texture_w = p->image_params.w;
@@ -2703,7 +2733,9 @@ static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t
     if (vpts == MP_NOPTS_VALUE)
         vpts = p->osd_pts;
 
-    if (p->osd && p->opts.blend_subs == BLEND_SUBS_VIDEO) {
+    if (p->osd && p->opts.blend_subs == BLEND_SUBS_VIDEO &&
+        (flags & RENDER_FRAME_SUBS))
+    {
         double scale[2];
         get_scale_factors(p, false, scale);
         struct mp_osd_res rect = {
@@ -2722,7 +2754,9 @@ static bool pass_render_frame(struct gl_video *p, struct mp_image *mpi, uint64_t
 
     int vp_w = p->dst_rect.x1 - p->dst_rect.x0,
         vp_h = p->dst_rect.y1 - p->dst_rect.y0;
-    if (p->osd && p->opts.blend_subs == BLEND_SUBS_YES) {
+    if (p->osd && p->opts.blend_subs == BLEND_SUBS_YES &&
+        (flags & RENDER_FRAME_SUBS))
+    {
         // Recreate the real video size from the src/dst rects
         struct mp_osd_res rect = {
             .w = vp_w, .h = vp_h,
@@ -2799,17 +2833,18 @@ static void pass_draw_to_screen(struct gl_video *p, struct ra_fbo fbo)
 
     pass_dither(p);
     pass_describe(p, "output to screen");
-    finish_pass_fbo(p, fbo, &p->dst_rect);
+    finish_pass_fbo(p, fbo, false, &p->dst_rect);
 }
 
+// flags: bit set of RENDER_FRAME_* flags
 static bool update_surface(struct gl_video *p, struct mp_image *mpi,
-                           uint64_t id, struct surface *surf)
+                           uint64_t id, struct surface *surf, int flags)
 {
     int vp_w = p->dst_rect.x1 - p->dst_rect.x0,
         vp_h = p->dst_rect.y1 - p->dst_rect.y0;
 
     pass_info_reset(p, false);
-    if (!pass_render_frame(p, mpi, id))
+    if (!pass_render_frame(p, mpi, id, flags))
         return false;
 
     // Frame blending should always be done in linear light to preserve the
@@ -2827,8 +2862,9 @@ static bool update_surface(struct gl_video *p, struct mp_image *mpi,
 }
 
 // Draws an interpolate frame to fbo, based on the frame timing in t
+// flags: bit set of RENDER_FRAME_* flags
 static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
-                                       struct ra_fbo fbo)
+                                       struct ra_fbo fbo, int flags)
 {
     bool is_new = false;
 
@@ -2842,7 +2878,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
     // it manually + reset the queue if not
     if (p->surfaces[p->surface_now].id == 0) {
         struct surface *now = &p->surfaces[p->surface_now];
-        if (!update_surface(p, t->current, t->frame_id, now))
+        if (!update_surface(p, t->current, t->frame_id, now, flags))
             return;
         p->surface_idx = p->surface_now;
         is_new = true;
@@ -2900,7 +2936,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
 
         if (f_id > p->surfaces[p->surface_idx].id) {
             struct surface *dst = &p->surfaces[surface_dst];
-            if (!update_surface(p, f, f_id, dst))
+            if (!update_surface(p, f, f_id, dst, flags))
                 return;
             p->surface_idx = surface_dst;
             surface_dst = surface_wrap(surface_dst + 1);
@@ -3000,7 +3036,7 @@ static void gl_video_interpolate_frame(struct gl_video *p, struct vo_frame *t,
 }
 
 void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame,
-                           struct ra_fbo fbo)
+                           struct ra_fbo fbo, int flags)
 {
     gl_video_update_options(p);
 
@@ -3043,7 +3079,7 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame,
         }
 
         if (interpolate) {
-            gl_video_interpolate_frame(p, frame, fbo);
+            gl_video_interpolate_frame(p, frame, fbo, flags);
         } else {
             bool is_new = frame->frame_id != p->image.id;
 
@@ -3055,18 +3091,25 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame,
                 p->output_tex_valid = false;
 
                 pass_info_reset(p, !is_new);
-                if (!pass_render_frame(p, frame->current, frame->frame_id))
+                if (!pass_render_frame(p, frame->current, frame->frame_id, flags))
                     goto done;
 
                 // For the non-interpolation case, we draw to a single "cache"
                 // texture to speed up subsequent re-draws (if any exist)
                 struct ra_fbo dest_fbo = fbo;
                 if (frame->num_vsyncs > 1 && frame->display_synced &&
-                    !p->dumb_mode && (p->ra->caps & RA_CAP_BLIT))
+                    !p->dumb_mode && (p->ra->caps & RA_CAP_BLIT) &&
+                    fbo.tex->params.blit_dst)
                 {
+                    // Attempt to use the same format as the destination FBO
+                    // if possible. Some RAs use a wrapped dummy format here,
+                    // so fall back to the fbo_format in that case.
+                    const struct ra_format *fmt = fbo.tex->params.format;
+                    if (fmt->dummy_format)
+                        fmt = p->fbo_format;
                     bool r = ra_tex_resize(p->ra, p->log, &p->output_tex,
                                            fbo.tex->params.w, fbo.tex->params.h,
-                                           p->fbo_format);
+                                           fmt);
                     if (r) {
                         dest_fbo = (struct ra_fbo) { p->output_tex };
                         p->output_tex_valid = true;
@@ -3076,7 +3119,7 @@ void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame,
             }
 
             // "output tex valid" and "output tex needed" are equivalent
-            if (p->output_tex_valid) {
+            if (p->output_tex_valid && fbo.tex->params.blit_dst) {
                 pass_info_reset(p, true);
                 pass_describe(p, "redraw cached frame");
                 struct mp_rect src = p->dst_rect;
@@ -3097,19 +3140,25 @@ done:
 
     debug_check_gl(p, "after video rendering");
 
-    if (p->osd) {
+    if (p->osd && (flags & (RENDER_FRAME_SUBS | RENDER_FRAME_OSD))) {
         // If we haven't actually drawn anything so far, then we technically
         // need to consider this the start of a new pass. Let's call it a
         // redraw just because, since it's basically a blank frame anyway
         if (!has_frame)
             pass_info_reset(p, true);
 
-        pass_draw_osd(p, p->opts.blend_subs ? OSD_DRAW_OSD_ONLY : 0,
-                      p->osd_pts, p->osd_rect, fbo, true);
+        int osd_flags = p->opts.blend_subs ? OSD_DRAW_OSD_ONLY : 0;
+        if (!(flags & RENDER_FRAME_SUBS))
+            osd_flags |= OSD_DRAW_OSD_ONLY;
+        if (!(flags & RENDER_FRAME_OSD))
+            osd_flags |= OSD_DRAW_SUB_ONLY;
+
+        pass_draw_osd(p, osd_flags, p->osd_pts, p->osd_rect, fbo, true);
         debug_check_gl(p, "after OSD rendering");
     }
 
-    if (gl_sc_error_state(p->sc) || p->broken_frame) {
+    p->broken_frame |= gl_sc_error_state(p->sc);
+    if (p->broken_frame) {
         // Make the screen solid blue to make it visually clear that an
         // error has occurred
         float color[4] = {0.0, 0.05, 0.5, 1.0};
@@ -3120,6 +3169,100 @@ done:
     pass_report_performance(p);
 }
 
+void gl_video_screenshot(struct gl_video *p, struct vo_frame *frame,
+                         struct voctrl_screenshot *args)
+{
+    if (!p->ra->fns->tex_download)
+        return; // nothing has been allocated yet, so a plain return is safe
+
+    bool ok = false;
+    struct mp_image *res = NULL;
+    struct ra_tex *target = NULL;
+    struct mp_rect old_src = p->src_rect;
+    struct mp_rect old_dst = p->dst_rect;
+    struct mp_osd_res old_osd = p->osd_rect;
+    struct vo_frame *nframe = vo_frame_ref(frame); // released at "done"
+
+    // Disable interpolation and such.
+    nframe->redraw = true;
+    nframe->repeat = false;
+    nframe->still = true;
+    nframe->pts = 0;
+    nframe->duration = -1;
+
+    if (!args->scaled) {
+        int w, h;
+        mp_image_params_get_dsize(&p->image_params, &w, &h);
+        if (w < 1 || h < 1)
+            goto done; // must not return directly: nframe would be leaked
+
+        if (p->image_params.rotate % 180 == 90)
+            MPSWAP(int, w, h);
+
+        struct mp_rect src = {0, 0, p->image_params.w, p->image_params.h};
+        struct mp_rect dst = {0, 0, w, h};
+        struct mp_osd_res osd = {.w = w, .h = h, .display_par = 1.0};
+        gl_video_resize(p, &src, &dst, &osd);
+    }
+
+    gl_video_reset_surfaces(p);
+
+    struct ra_tex_params params = {
+        .dimensions = 2,
+        .downloadable = true,
+        .w = p->osd_rect.w,
+        .h = p->osd_rect.h,
+        .render_dst = true,
+    };
+
+    params.format = ra_find_unorm_format(p->ra, 1, 4);
+    int mpfmt = IMGFMT_RGB0;
+    if (args->high_bit_depth && p->ra_format.component_bits > 8) {
+        const struct ra_format *fmt = ra_find_unorm_format(p->ra, 2, 4);
+        if (fmt && fmt->renderable) {
+            params.format = fmt;
+            mpfmt = IMGFMT_RGBA64;
+        }
+    }
+
+    if (!params.format || !params.format->renderable)
+        goto done;
+    target = ra_tex_create(p->ra, &params);
+    if (!target)
+        goto done;
+
+    int flags = 0;
+    if (args->subs)
+        flags |= RENDER_FRAME_SUBS;
+    if (args->osd)
+        flags |= RENDER_FRAME_OSD;
+    gl_video_render_frame(p, nframe, (struct ra_fbo){target}, flags);
+
+    res = mp_image_alloc(mpfmt, params.w, params.h);
+    if (!res)
+        goto done;
+
+    struct ra_tex_download_params download_params = {
+        .tex = target,
+        .dst = res->planes[0],
+        .stride = res->stride[0],
+    };
+    if (!p->ra->fns->tex_download(p->ra, &download_params))
+        goto done;
+
+    if (p->broken_frame)
+        goto done;
+
+    ok = true;
+done: // common exit: free temporaries, restore the saved rects, publish result
+    talloc_free(nframe);
+    ra_tex_free(p->ra, &target);
+    gl_video_resize(p, &old_src, &old_dst, &old_osd);
+    if (!ok)
+        TA_FREEP(&res);
+    args->res = res; // NULL on any failure
+}
+
 // Use this color instead of the global option.
 void gl_video_set_clear_color(struct gl_video *p, struct m_color c)
 {
@@ -3154,7 +3297,7 @@ void gl_video_resize(struct gl_video *p,
     gl_video_reset_surfaces(p);
 
     if (p->osd)
-        mpgl_osd_resize(p->osd, p->osd_rect, p->image_params.stereo_out);
+        mpgl_osd_resize(p->osd, p->osd_rect, p->image_params.stereo3d);
 }
 
 static void frame_perf_data(struct pass_info pass[], struct mp_frame_perf *out)
@@ -3205,7 +3348,7 @@ static void reinterleave_vdpau(struct gl_video *p,
         const struct ra_format *fmt = ra_find_unorm_format(p->ra, 1, comps);
         ra_tex_resize(p->ra, p->log, tex, w, h * 2, fmt);
         struct ra_fbo fbo = { *tex };
-        finish_pass_fbo(p, fbo, &(struct mp_rect){0, 0, w, h * 2});
+        finish_pass_fbo(p, fbo, true, &(struct mp_rect){0, 0, w, h * 2});
 
         output[n] = *tex;
     }
@@ -3333,7 +3476,7 @@ static bool test_fbo(struct gl_video *p, const struct ra_format *fmt)
 }
 
 // Return whether dumb-mode can be used without disabling any features.
-// Essentially, vo_opengl with mostly default settings will return true.
+// Essentially, vo_gpu with mostly default settings will return true.
 static bool check_dumb_mode(struct gl_video *p)
 {
     struct gl_video_opts *o = &p->opts;
@@ -3374,25 +3517,63 @@ static void check_gl_features(struct gl_video *p)
     bool have_texrg = rg_tex && !rg_tex->luminance_alpha;
     bool have_compute = ra->caps & RA_CAP_COMPUTE;
     bool have_ssbo = ra->caps & RA_CAP_BUF_RW;
+    bool have_fragcoord = ra->caps & RA_CAP_FRAGCOORD;
 
-    const char *auto_fbo_fmts[] = {"rgba16", "rgba16f", "rgba16hf",
+    const char *auto_fbo_fmts[] = {"rgba16f", "rgba16hf", "rgba16",
                                    "rgb10_a2", "rgba8", 0};
     const char *user_fbo_fmts[] = {p->opts.fbo_format, 0};
     const char **fbo_fmts = user_fbo_fmts[0] && strcmp(user_fbo_fmts[0], "auto")
                           ? user_fbo_fmts : auto_fbo_fmts;
+    bool user_specified_fbo_fmt = fbo_fmts == user_fbo_fmts;
+    bool fbo_test_result = false;
     bool have_fbo = false;
     p->fbo_format = NULL;
     for (int n = 0; fbo_fmts[n]; n++) {
         const char *fmt = fbo_fmts[n];
         const struct ra_format *f = ra_find_named_format(p->ra, fmt);
-        if (!f && fbo_fmts == user_fbo_fmts)
+        if (!f && user_specified_fbo_fmt)
             MP_WARN(p, "FBO format '%s' not found!\n", fmt);
-        if (f && f->renderable && f->linear_filter && test_fbo(p, f)) {
+        if (f && f->renderable && f->linear_filter &&
+            (fbo_test_result = test_fbo(p, f))) {
             MP_VERBOSE(p, "Using FBO format %s.\n", f->name);
             have_fbo = true;
             p->fbo_format = f;
             break;
         }
+
+        if (user_specified_fbo_fmt) {
+            MP_WARN(p, "User-specified FBO format '%s' failed to initialize! "
+                       "(exists=%d, renderable=%d, linear_filter=%d, "
+                       "fbo_test_result=%d)\n",
+                    fmt, !!f, f ? f->renderable : 0,  f ? f->linear_filter : 0,
+                    fbo_test_result);
+        }
+    }
+
+    if (!have_fragcoord && p->opts.dither_depth >= 0 &&
+        p->opts.dither_algo != DITHER_NONE)
+    {
+        p->opts.dither_algo = DITHER_NONE;
+        MP_WARN(p, "Disabling dithering (no gl_FragCoord).\n");
+    }
+    if (!have_fragcoord && p->opts.alpha_mode == ALPHA_BLEND_TILES) {
+        p->opts.alpha_mode = ALPHA_BLEND;
+        // Verbose, since this is the default setting
+        MP_VERBOSE(p, "Disabling alpha checkerboard (no gl_FragCoord).\n");
+    }
+    if (!have_fbo && have_compute) {
+        have_compute = false;
+        MP_WARN(p, "Force-disabling compute shaders as an FBO format was not "
+                   "available! See your FBO format configuration!\n");
+    }
+
+    bool have_compute_peak = have_compute && have_ssbo;
+    if (!have_compute_peak && p->opts.compute_hdr_peak >= 0) {
+        int msgl = p->opts.compute_hdr_peak == 1 ? MSGL_WARN : MSGL_V;
+        MP_MSG(p, msgl, "Disabling HDR peak computation (one or more of the "
+                        "following is not supported: compute shaders=%d, "
+                        "SSBO=%d).\n", have_compute, have_ssbo);
+        p->opts.compute_hdr_peak = -1;
     }
 
     p->forced_dumb_mode = p->opts.dumb_mode > 0 || !have_fbo || !have_texrg;
@@ -3414,6 +3595,7 @@ static void check_gl_features(struct gl_video *p)
             .alpha_mode = p->opts.alpha_mode,
             .use_rectangle = p->opts.use_rectangle,
             .background = p->opts.background,
+            .compute_hdr_peak = p->opts.compute_hdr_peak,
             .dither_algo = p->opts.dither_algo,
             .dither_depth = p->opts.dither_depth,
             .dither_size = p->opts.dither_size,
@@ -3479,23 +3661,6 @@ static void check_gl_features(struct gl_video *p)
         p->opts.deband = 0;
         MP_WARN(p, "Disabling debanding (GLSL version too old).\n");
     }
-    if ((!have_compute || !have_ssbo) && p->opts.compute_hdr_peak) {
-        p->opts.compute_hdr_peak = 0;
-        MP_WARN(p, "Disabling HDR peak computation (no compute shaders).\n");
-    }
-    if (!(ra->caps & RA_CAP_FRAGCOORD) && p->opts.dither_depth >= 0 &&
-        p->opts.dither_algo != DITHER_NONE)
-    {
-        p->opts.dither_algo = DITHER_NONE;
-        MP_WARN(p, "Disabling dithering (no gl_FragCoord).\n");
-    }
-    if (!(ra->caps & RA_CAP_FRAGCOORD) &&
-        p->opts.alpha_mode == ALPHA_BLEND_TILES)
-    {
-        p->opts.alpha_mode = ALPHA_BLEND;
-        // Verbose, since this is the default setting
-        MP_VERBOSE(p, "Disabling alpha checkerboard (no gl_FragCoord).\n");
-    }
 }
 
 static void init_gl(struct gl_video *p)
@@ -3838,6 +4003,9 @@ static void gl_video_dr_free_buffer(void *opaque, uint8_t *data)
 struct mp_image *gl_video_get_image(struct gl_video *p, int imgfmt, int w, int h,
                                     int stride_align)
 {
+    if (!gl_video_check_format(p, imgfmt))
+        return NULL;
+
     int size = mp_image_get_alloc_size(imgfmt, w, h, stride_align);
     if (size < 0)
         return NULL;
diff --git a/video/out/gpu/video.h b/video/out/gpu/video.h
index 78f8828..2184599 100644
--- a/video/out/gpu/video.h
+++ b/video/out/gpu/video.h
@@ -96,7 +96,7 @@ enum tone_mapping {
 };
 
 // How many frames to average over for HDR peak detection
-#define PEAK_DETECT_FRAMES 100
+#define PEAK_DETECT_FRAMES 63
 
 struct gl_video_opts {
     int dumb_mode;
@@ -106,7 +106,7 @@ struct gl_video_opts {
     int gamma_auto;
     int target_prim;
     int target_trc;
-    int target_brightness;
+    int target_peak;
     int tone_mapping;
     int compute_hdr_peak;
     float tone_mapping_param;
@@ -146,6 +146,13 @@ extern const struct m_sub_options gl_video_conf;
 
 struct gl_video;
 struct vo_frame;
+struct voctrl_screenshot;
+
+enum {
+    RENDER_FRAME_SUBS = 1 << 0, // draw subtitles
+    RENDER_FRAME_OSD = 1 << 1,  // draw the OSD
+    RENDER_FRAME_DEF = RENDER_FRAME_SUBS | RENDER_FRAME_OSD,
+};
 
 struct gl_video *gl_video_init(struct ra *ra, struct mp_log *log,
                                struct mpv_global *g);
@@ -153,9 +160,8 @@ void gl_video_uninit(struct gl_video *p);
 void gl_video_set_osd_source(struct gl_video *p, struct osd_state *osd);
 bool gl_video_check_format(struct gl_video *p, int mp_format);
 void gl_video_config(struct gl_video *p, struct mp_image_params *params);
-void gl_video_set_output_depth(struct gl_video *p, int r, int g, int b);
 void gl_video_render_frame(struct gl_video *p, struct vo_frame *frame,
-                           struct ra_fbo fbo);
+                           struct ra_fbo fbo, int flags);
 void gl_video_resize(struct gl_video *p,
                      struct mp_rect *src, struct mp_rect *dst,
                      struct mp_osd_res *osd);
@@ -166,6 +172,9 @@ void gl_video_set_osd_pts(struct gl_video *p, double pts);
 bool gl_video_check_osd_change(struct gl_video *p, struct mp_osd_res *osd,
                                double pts);
 
+void gl_video_screenshot(struct gl_video *p, struct vo_frame *frame,
+                         struct voctrl_screenshot *args);
+
 float gl_video_scale_ambient_lux(float lmin, float lmax,
                                  float rmin, float rmax, float lux);
 void gl_video_set_ambient_lux(struct gl_video *p, int lux);
diff --git a/video/out/gpu/video_shaders.c b/video/out/gpu/video_shaders.c
index 3e71c31..2b18d17 100644
--- a/video/out/gpu/video_shaders.c
+++ b/video/out/gpu/video_shaders.c
@@ -334,6 +334,10 @@ static const float SLOG_A = 0.432699,
 // Linearize (expand), given a TRC as input. In essence, this is the ITU-R
 // EOTF, calculated on an idealized (reference) monitor with a white point of
 // MP_REF_WHITE and infinite contrast.
+//
+// These functions always output to a normalized scale of [0,1], for
+// convenience of the video.c code that calls it. To get the values in an
+// absolute scale, multiply the result by `mp_trc_nom_peak(trc)`
 void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
 {
     if (trc == MP_CSP_TRC_LINEAR)
@@ -417,6 +421,8 @@ void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
 // Delinearize (compress), given a TRC as output. This corresponds to the
 // inverse EOTF (not the OETF) in ITU-R terminology, again assuming a
 // reference monitor.
+//
+// Like pass_linearize, this function ingests values on a normalized scale
 void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
 {
     if (trc == MP_CSP_TRC_LINEAR)
@@ -488,24 +494,25 @@ void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc)
 }
 
 // Apply the OOTF mapping from a given light type to display-referred light.
-// The extra peak parameter is used to scale the values before and after
-// the OOTF, and can be inferred using mp_trc_nom_peak
-void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak)
+// Assumes absolute scale values. `peak` is used to tune the OOTF where
+// applicable (currently only HLG).
+static void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light,
+                      float peak)
 {
     if (light == MP_CSP_LIGHT_DISPLAY)
         return;
 
     GLSLF("// apply ootf\n");
-    GLSLF("color.rgb *= vec3(%f);\n", peak);
 
     switch (light)
     {
-    case MP_CSP_LIGHT_SCENE_HLG:
-        // HLG OOTF from BT.2100, assuming a reference display with a
-        // peak of 1000 cd/m² -> gamma = 1.2
-        GLSLF("color.rgb *= vec3(%f * pow(dot(src_luma, color.rgb), 0.2));\n",
-              (1000 / MP_REF_WHITE) / pow(12, 1.2));
+    case MP_CSP_LIGHT_SCENE_HLG: {
+        // HLG OOTF from BT.2100, scaled to the chosen display peak
+        float gamma = MPMAX(1.0, 1.2 + 0.42 * log10(peak * MP_REF_WHITE / 1000.0));
+        GLSLF("color.rgb *= vec3(%f * pow(dot(src_luma, color.rgb), %f));\n",
+              peak / pow(12, gamma), gamma - 1.0);
         break;
+    }
     case MP_CSP_LIGHT_SCENE_709_1886:
         // This OOTF is defined by encoding the result as 709 and then decoding
         // it as 1886; although this is called 709_1886 we actually use the
@@ -521,25 +528,26 @@ void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak)
     default:
         abort();
     }
-
-    GLSLF("color.rgb *= vec3(1.0/%f);\n", peak);
 }
 
 // Inverse of the function pass_ootf, for completeness' sake.
-void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak)
+static void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light,
+                              float peak)
 {
     if (light == MP_CSP_LIGHT_DISPLAY)
         return;
 
     GLSLF("// apply inverse ootf\n");
-    GLSLF("color.rgb *= vec3(%f);\n", peak);
 
     switch (light)
     {
-    case MP_CSP_LIGHT_SCENE_HLG:
-        GLSLF("color.rgb *= vec3(1.0/%f);\n", (1000 / MP_REF_WHITE) / pow(12, 1.2));
-        GLSL(color.rgb /= vec3(max(1e-6, pow(dot(src_luma, color.rgb), 0.2/1.2)));)
+    case MP_CSP_LIGHT_SCENE_HLG: {
+        float gamma = MPMAX(1.0, 1.2 + 0.42 * log10(peak * MP_REF_WHITE / 1000.0));
+        GLSLF("color.rgb *= vec3(1.0/%f);\n", peak / pow(12, gamma));
+        GLSLF("color.rgb /= vec3(max(1e-6, pow(dot(src_luma, color.rgb), %f)));\n",
+              (gamma - 1.0) / gamma);
         break;
+    }
     case MP_CSP_LIGHT_SCENE_709_1886:
         GLSL(color.rgb = pow(color.rgb, vec3(1.0/2.4));)
         GLSL(color.rgb = mix(color.rgb * vec3(1.0/4.5),
@@ -553,13 +561,89 @@ void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, floa
     default:
         abort();
     }
+}
 
-    GLSLF("color.rgb *= vec3(1.0/%f);\n", peak);
+// Average light level for SDR signals. This is equal to a signal level of 0.5
+// under a typical presentation gamma of about 2.0.
+static const float sdr_avg = 0.25;
+
+// The threshold for which to consider an average luminance difference to be
+// a sign of a scene change.
+static const int scene_threshold = 0.2 * MP_REF_WHITE;
+
+static void hdr_update_peak(struct gl_shader_cache *sc)
+{
+    // For performance, we want to do as few atomic operations on global
+    // memory as possible, so use an atomic in shmem for the work group.
+    GLSLH(shared uint wg_sum;);
+    GLSL(wg_sum = 0;)
+
+    // Have each thread update the work group sum with the local value
+    GLSL(barrier();)
+    GLSLF("atomicAdd(wg_sum, uint(sig * %f));\n", MP_REF_WHITE);
+
+    // Have one thread per work group update the global atomics. We use the
+    // work group average even for the global sum, to make the values slightly
+    // more stable and smooth out tiny super-highlights.
+    GLSL(memoryBarrierShared();)
+    GLSL(barrier();)
+    GLSL(if (gl_LocalInvocationIndex == 0) {)
+    GLSL(    uint wg_avg = wg_sum / (gl_WorkGroupSize.x * gl_WorkGroupSize.y);)
+    GLSL(    atomicMax(frame_max[frame_idx], wg_avg);)
+    GLSL(    atomicAdd(frame_avg[frame_idx], wg_avg);)
+    GLSL(})
+
+    const float refi = 1.0 / MP_REF_WHITE;
+
+    // Update the sig_peak/sig_avg from the old SSBO state
+    GLSL(uint num_wg = gl_NumWorkGroups.x * gl_NumWorkGroups.y;)
+    GLSL(if (frame_num > 0) {)
+    GLSLF("    float peak = %f * float(total_max) / float(frame_num);\n", refi);
+    GLSLF("    float avg = %f * float(total_avg) / float(frame_num);\n", refi);
+    GLSLF("    sig_peak = max(1.0, peak);\n");
+    GLSLF("    sig_avg  = max(%f, avg);\n", sdr_avg);
+    GLSL(});
+
+    // Finally, to update the global state, we increment a counter per dispatch
+    GLSL(memoryBarrierBuffer();)
+    GLSL(barrier();)
+    GLSL(if (gl_LocalInvocationIndex == 0 && atomicAdd(counter, 1) == num_wg - 1) {)
+
+    // Since we sum up all the workgroups, we also still need to divide the
+    // average by the number of work groups
+    GLSL(    counter = 0;)
+    GLSL(    frame_avg[frame_idx] /= num_wg;)
+    GLSL(    uint cur_max = frame_max[frame_idx];)
+    GLSL(    uint cur_avg = frame_avg[frame_idx];)
+
+    // Scene change detection
+    GLSL(    int diff = int(frame_num * cur_avg) - int(total_avg);)
+    GLSLF("  if (abs(diff) > frame_num * %d) {\n", scene_threshold);
+    GLSL(        frame_num = 0;)
+    GLSL(        total_max = total_avg = 0;)
+    GLSLF("      for (uint i = 0; i < %d; i++)\n", PEAK_DETECT_FRAMES+1);
+    GLSL(            frame_max[i] = frame_avg[i] = 0;)
+    GLSL(        frame_max[frame_idx] = cur_max;)
+    GLSL(        frame_avg[frame_idx] = cur_avg;)
+    GLSL(    })
+
+    // Add the current frame, then subtract and reset the next frame
+    GLSLF("  uint next = (frame_idx + 1) %% %d;\n", PEAK_DETECT_FRAMES+1);
+    GLSL(    total_max += cur_max - frame_max[next];)
+    GLSL(    total_avg += cur_avg - frame_avg[next];)
+    GLSL(    frame_max[next] = frame_avg[next] = 0;)
+
+    // Update the index and count
+    GLSL(    frame_idx = next;)
+    GLSLF("  frame_num = min(frame_num + 1, %d);\n", PEAK_DETECT_FRAMES);
+    GLSL(    memoryBarrierBuffer();)
+    GLSL(})
 }
 
 // Tone map from a known peak brightness to the range [0,1]. If ref_peak
 // is 0, we will use peak detection instead
-static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
+static void pass_tone_map(struct gl_shader_cache *sc, bool detect_peak,
+                          float src_peak, float dst_peak,
                           enum tone_mapping algo, float param, float desat)
 {
     GLSLF("// HDR tone mapping\n");
@@ -568,57 +652,44 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
     // sure to reduce the value range as far as necessary to keep the entire
     // signal in range, so tone map based on the brightest component.
     GLSL(float sig = max(max(color.r, color.g), color.b);)
+    GLSLF("float sig_peak = %f;\n", src_peak);
+    GLSLF("float sig_avg = %f;\n", sdr_avg);
+
+    if (detect_peak)
+        hdr_update_peak(sc);
+
+    // Rescale the variables in order to bring it into a representation where
+    // 1.0 represents the dst_peak. This is because all of the tone mapping
+    // algorithms are defined in such a way that they map to the range [0.0, 1.0].
+    if (dst_peak > 1.0) {
+        GLSLF("sig *= %f;\n", 1.0 / dst_peak);
+        GLSLF("sig_peak *= %f;\n", 1.0 / dst_peak);
+    }
 
-    // Desaturate the color using a coefficient dependent on the signal
+    GLSL(float sig_orig = sig;)
+    GLSLF("float slope = min(1.0, %f / sig_avg);\n", sdr_avg);
+    GLSL(sig *= slope;)
+    GLSL(sig_peak *= slope;)
+
+    // Desaturate the color using a coefficient dependent on the signal.
+    // Do this after peak detection in order to prevent over-desaturating
+    // overly bright sources
     if (desat > 0) {
+        float base = 0.18 * dst_peak;
         GLSL(float luma = dot(dst_luma, color.rgb);)
-        GLSL(float coeff = max(sig - 0.18, 1e-6) / max(sig, 1e-6););
+        GLSLF("float coeff = max(sig - %f, 1e-6) / max(sig, 1e-6);\n", base);
         GLSLF("coeff = pow(coeff, %f);\n", 10.0 / desat);
         GLSL(color.rgb = mix(color.rgb, vec3(luma), coeff);)
-        GLSL(sig = mix(sig, luma, coeff);) // also make sure to update `sig`
-    }
-
-    if (!ref_peak) {
-        // For performance, we want to do as few atomic operations on global
-        // memory as possible, so use an atomic in shmem for the work group.
-        // We also want slightly more stable values, so use the group average
-        // instead of the group max
-        GLSLHF("shared uint group_sum = 0;\n");
-        GLSLF("atomicAdd(group_sum, uint(sig * %f));\n", MP_REF_WHITE);
-
-        // Have one thread in each work group update the frame maximum
-        GLSL(memoryBarrierBuffer();)
-        GLSL(barrier();)
-        GLSL(if (gl_LocalInvocationIndex == 0))
-            GLSL(atomicMax(frame_max[index], group_sum /
-                 (gl_WorkGroupSize.x * gl_WorkGroupSize.y));)
-
-        // Finally, have one thread per invocation update the total maximum
-        // and advance the index
-        GLSL(memoryBarrierBuffer();)
-        GLSL(barrier();)
-        GLSL(if (gl_GlobalInvocationID == ivec3(0)) {) // do this once per invocation
-            GLSLF("uint next = (index + 1) %% %d;\n", PEAK_DETECT_FRAMES+1);
-            GLSLF("sig_peak_raw = sig_peak_raw + frame_max[index] - frame_max[next];\n");
-            GLSLF("frame_max[next] = %d;\n", (int)MP_REF_WHITE);
-            GLSL(index = next;)
-        GLSL(})
-
-        GLSL(memoryBarrierBuffer();)
-        GLSL(barrier();)
-        GLSLF("float sig_peak = 1.0/%f * float(sig_peak_raw);\n",
-              MP_REF_WHITE * PEAK_DETECT_FRAMES);
-    } else {
-        GLSLHF("const float sig_peak = %f;\n", ref_peak);
+        GLSL(sig = mix(sig, luma * slope, coeff);) // also make sure to update `sig`
     }
 
-    GLSL(float sig_orig = sig;)
     switch (algo) {
     case TONE_MAPPING_CLIP:
         GLSLF("sig = %f * sig;\n", isnan(param) ? 1.0 : param);
         break;
 
     case TONE_MAPPING_MOBIUS:
+        GLSLF("if (sig_peak > (1.0 + 1e-6)) {\n");
         GLSLF("const float j = %f;\n", isnan(param) ? 0.3 : param);
         // solve for M(j) = j; M(sig_peak) = 1.0; M'(j) = 1.0
         // where M(x) = scale * (x+a)/(x+b)
@@ -627,6 +698,7 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
               "max(1e-6, sig_peak - 1.0);\n");
         GLSLF("float scale = (b*b + 2.0*b*j + j*j) / (b-a);\n");
         GLSL(sig = sig > j ? scale * (sig + a) / (sig + b) : sig;)
+        GLSLF("}\n");
         break;
 
     case TONE_MAPPING_REINHARD: {
@@ -668,7 +740,8 @@ static void pass_tone_map(struct gl_shader_cache *sc, float ref_peak,
 
     // Apply the computed scale factor to the color, linearly to prevent
     // discoloration
-    GLSL(color.rgb *= sig / sig_orig;)
+    GLSL(sig = min(sig, 1.0);)
+    GLSL(color.rgb *= vec3(sig / sig_orig);)
 }
 
 // Map colors from one source space to another. These source spaces must be
@@ -686,11 +759,6 @@ void pass_color_map(struct gl_shader_cache *sc,
 {
     GLSLF("// color mapping\n");
 
-    // Compute the highest encodable level
-    float src_range = mp_trc_nom_peak(src.gamma),
-          dst_range = mp_trc_nom_peak(dst.gamma);
-    float ref_peak = src.sig_peak / dst_range;
-
     // Some operations need access to the video's luma coefficients, so make
     // them available
     float rgb2xyz[3][3];
@@ -699,30 +767,29 @@ void pass_color_map(struct gl_shader_cache *sc,
     mp_get_rgb2xyz_matrix(mp_get_csp_primaries(dst.primaries), rgb2xyz);
     gl_sc_uniform_vec3(sc, "dst_luma", rgb2xyz[1]);
 
+    bool need_ootf = src.light != dst.light;
+    if (src.light == MP_CSP_LIGHT_SCENE_HLG && src.sig_peak != dst.sig_peak)
+        need_ootf = true;
+
     // All operations from here on require linear light as a starting point,
     // so we linearize even if src.gamma == dst.gamma when one of the other
     // operations needs it
-    bool need_gamma = src.gamma != dst.gamma ||
-                      src.primaries != dst.primaries ||
-                      src_range != dst_range ||
-                      src.sig_peak > dst_range ||
-                      src.light != dst.light;
+    bool need_linear = src.gamma != dst.gamma ||
+                       src.primaries != dst.primaries ||
+                       src.sig_peak > dst.sig_peak ||
+                       need_ootf;
 
-    if (need_gamma && !is_linear) {
+    if (need_linear && !is_linear) {
+        // We also pull it up so that 1.0 is the reference white
         pass_linearize(sc, src.gamma);
-        is_linear= true;
+        is_linear = true;
     }
 
-    if (src.light != dst.light)
-        pass_ootf(sc, src.light, mp_trc_nom_peak(src.gamma));
+    // Pre-scale the incoming values into an absolute scale
+    GLSLF("color.rgb *= vec3(%f);\n", mp_trc_nom_peak(src.gamma));
 
-    // Rescale the signal to compensate for differences in the encoding range
-    // and reference white level. This is necessary because of how mpv encodes
-    // brightness in textures.
-    if (src_range != dst_range) {
-        GLSLF("// rescale value range;\n");
-        GLSLF("color.rgb *= vec3(%f);\n", src_range / dst_range);
-    }
+    if (need_ootf)
+        pass_ootf(sc, src.light, src.sig_peak);
 
     // Adapt to the right colorspace if necessary
     if (src.primaries != dst.primaries) {
@@ -732,20 +799,26 @@ void pass_color_map(struct gl_shader_cache *sc,
         mp_get_cms_matrix(csp_src, csp_dst, MP_INTENT_RELATIVE_COLORIMETRIC, m);
         gl_sc_uniform_mat3(sc, "cms_matrix", true, &m[0][0]);
         GLSL(color.rgb = cms_matrix * color.rgb;)
-        // Since this can reduce the gamut, figure out by how much
-        for (int c = 0; c < 3; c++)
-            ref_peak = MPMAX(ref_peak, m[c][c]);
     }
 
     // Tone map to prevent clipping when the source signal peak exceeds the
     // encodable range or we've reduced the gamut
-    if (ref_peak > 1) {
-        pass_tone_map(sc, detect_peak ? 0 : ref_peak, algo,
+    if (src.sig_peak > dst.sig_peak) {
+        pass_tone_map(sc, detect_peak, src.sig_peak, dst.sig_peak, algo,
                       tone_mapping_param, tone_mapping_desat);
     }
 
-    if (src.light != dst.light)
-        pass_inverse_ootf(sc, dst.light, mp_trc_nom_peak(dst.gamma));
+    if (need_ootf)
+        pass_inverse_ootf(sc, dst.light, dst.sig_peak);
+
+    // Post-scale the outgoing values from absolute scale to normalized.
+    // For SDR, we normalize to the chosen signal peak. For HDR, we normalize
+    // to the encoding range of the transfer function.
+    float dst_range = dst.sig_peak;
+    if (mp_trc_is_hdr(dst.gamma))
+        dst_range = mp_trc_nom_peak(dst.gamma);
+
+    GLSLF("color.rgb *= vec3(%f);\n", 1.0 / dst_range);
 
     // Warn for remaining out-of-gamut colors is enabled
     if (gamut_warning) {
diff --git a/video/out/gpu/video_shaders.h b/video/out/gpu/video_shaders.h
index 2ae2ac3..cd395d6 100644
--- a/video/out/gpu/video_shaders.h
+++ b/video/out/gpu/video_shaders.h
@@ -39,8 +39,6 @@ void pass_sample_oversample(struct gl_shader_cache *sc, struct scaler *scaler,
 
 void pass_linearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
 void pass_delinearize(struct gl_shader_cache *sc, enum mp_csp_trc trc);
-void pass_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak);
-void pass_inverse_ootf(struct gl_shader_cache *sc, enum mp_csp_light light, float peak);
 
 void pass_color_map(struct gl_shader_cache *sc,
                     struct mp_colorspace src, struct mp_colorspace dst,
diff --git a/video/out/libmpv.h b/video/out/libmpv.h
new file mode 100644
index 0000000..2fe3338
--- /dev/null
+++ b/video/out/libmpv.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include <stdint.h>
+#include <stdbool.h>
+#include "libmpv/render.h"
+#include "vo.h"
+
+// Helper for finding a parameter value. It returns the direct pointer to the
+// value, and if not present, just returns the def argument. In particular, if
+// def is not NULL, this never returns NULL (unless a param value is defined
+// as accepting NULL, or the libmpv API user is triggering UB).
+void *get_mpv_render_param(mpv_render_param *params, mpv_render_param_type type,
+                           void *def);
+
+#define GET_MPV_RENDER_PARAM(params, type, ctype, def) \
+    (*(ctype *)get_mpv_render_param(params, type, &(ctype){(def)}))
+
+typedef int (*mp_render_cb_control_fn)(struct vo *vo, void *cb_ctx, int *events,
+                                       uint32_t request, void *data);
+void mp_render_context_set_control_callback(mpv_render_context *ctx,
+                                            mp_render_cb_control_fn callback,
+                                            void *callback_ctx);
+bool mp_render_context_acquire(mpv_render_context *ctx);
+
+struct render_backend {
+    struct mpv_global *global;
+    struct mp_log *log;
+    const struct render_backend_fns *fns;
+
+    // Set on init, immutable afterwards.
+    int driver_caps;
+    struct mp_hwdec_devices *hwdec_devs;
+
+    void *priv;
+};
+
+// Generic backend for rendering via libmpv. This corresponds to vo/vo_driver,
+// except for rendering via the mpv_render_*() API. (As a consequence it's as
+// generic as the VO API.) Like with VOs, one backend can support multiple
+// underlying GPU APIs.
+struct render_backend_fns {
+    // Returns libmpv error code. In particular, this function has to check for
+    // MPV_RENDER_PARAM_API_TYPE, and silently return MPV_ERROR_NOT_IMPLEMENTED
+    // if the API is not included in this backend.
+    // If this fails, ->destroy() will be called.
+    int (*init)(struct render_backend *ctx, mpv_render_param *params);
+    // Check if the passed IMGFMT_ is supported.
+    bool (*check_format)(struct render_backend *ctx, int imgfmt);
+    // Implementation of mpv_render_context_set_parameter(). Optional.
+    int (*set_parameter)(struct render_backend *ctx, mpv_render_param param);
+    // Like vo_driver.reconfig().
+    void (*reconfig)(struct render_backend *ctx, struct mp_image_params *params);
+    // Like VOCTRL_RESET.
+    void (*reset)(struct render_backend *ctx);
+    void (*screenshot)(struct render_backend *ctx, struct vo_frame *frame,
+                       struct voctrl_screenshot *args);
+    // Like vo_driver.get_image().
+    struct mp_image *(*get_image)(struct render_backend *ctx, int imgfmt,
+                                  int w, int h, int stride_align);
+    // This has two purposes: 1. set queue attributes on VO, 2. update the
+    // renderer's OSD pointer. Keep in mind that as soon as the caller releases
+    // the renderer lock, the VO pointer can become invalid. The OSD pointer
+    // will technically remain valid (even though it's a vo field), until it's
+    // unset with this function.
+    // Will be called if vo changes, or if renderer options change.
+    void (*update_external)(struct render_backend *ctx, struct vo *vo);
+    // Update screen area.
+    void (*resize)(struct render_backend *ctx, struct mp_rect *src,
+                   struct mp_rect *dst, struct mp_osd_res *osd);
+    // Get target surface size from mpv_render_context_render() arguments.
+    int (*get_target_size)(struct render_backend *ctx, mpv_render_param *params,
+                           int *out_w, int *out_h);
+    // Implementation of mpv_render_context_render().
+    int (*render)(struct render_backend *ctx, mpv_render_param *params,
+                  struct vo_frame *frame);
+    // Free all data in ctx->priv.
+    void (*destroy)(struct render_backend *ctx);
+};
+
+extern const struct render_backend_fns render_backend_gpu;
diff --git a/video/out/opengl/common.c b/video/out/opengl/common.c
index fda40da..4b0cbcc 100644
--- a/video/out/opengl/common.c
+++ b/video/out/opengl/common.c
@@ -453,6 +453,7 @@ static const struct gl_functions gl_functions[] = {
     },
     // These don't exist - they are for the sake of mpv internals, and libmpv
     // interaction (see libmpv/opengl_cb.h).
+    // This is not used by the render API, only the deprecated opengl-cb API.
     {
         .extension = "GL_MP_MPGetNativeDisplay",
         .functions = (const struct gl_function[]) {
@@ -664,13 +665,3 @@ void mpgl_load_functions(GL *gl, void *(*getProcAddress)(const GLubyte *),
 {
     mpgl_load_functions2(gl, get_procaddr_wrapper, getProcAddress, ext2, log);
 }
-
-void *mpgl_get_native_display(struct GL *gl, const char *name)
-{
-    void *res = NULL;
-    if (gl->get_native_display)
-        res = gl->get_native_display(gl->get_native_display_ctx, name);
-    if (!res && gl->MPGetNativeDisplay)
-        res = gl->MPGetNativeDisplay(name);
-    return res;
-}
diff --git a/video/out/opengl/common.h b/video/out/opengl/common.h
index b9f582b..38414fe 100644
--- a/video/out/opengl/common.h
+++ b/video/out/opengl/common.h
@@ -78,9 +78,6 @@ void mpgl_load_functions2(GL *gl, void *(*get_fn)(void *ctx, const char *n),
 typedef void (GLAPIENTRY *MP_GLDEBUGPROC)(GLenum, GLenum, GLuint, GLenum,
                                           GLsizei, const GLchar *,const void *);
 
-// Return a named host API reference (e.g. "wl" -> wl_display).
-void *mpgl_get_native_display(struct GL *gl, const char *name);
-
 //function pointers loaded from the OpenGL library
 struct GL {
     int version;                // MPGL_VER() mangled (e.g. 210 for 2.1)
@@ -90,11 +87,6 @@ struct GL {
     int mpgl_caps;              // Bitfield of MPGL_CAP_* constants
     bool debug_context;         // use of e.g. GLX_CONTEXT_DEBUG_BIT_ARB
 
-    // Use mpgl_get_native_display() instead. Also, this is set to use the
-    // fields in MPGLContext by default (if set).
-    void *get_native_display_ctx;
-    void *(*get_native_display)(void *ctx, const char *name);
-
     void (GLAPIENTRY *Viewport)(GLint, GLint, GLsizei, GLsizei);
     void (GLAPIENTRY *Clear)(GLbitfield);
     void (GLAPIENTRY *GenTextures)(GLsizei, GLuint *);
diff --git a/video/out/opengl/context.c b/video/out/opengl/context.c
index cdaf632..43b57aa 100644
--- a/video/out/opengl/context.c
+++ b/video/out/opengl/context.c
@@ -125,17 +125,6 @@ done:
     return ret;
 }
 
-static void *get_native_display(void *priv, const char *name)
-{
-    struct priv *p = priv;
-    if (!p->params.native_display_type || !name)
-        return NULL;
-    if (strcmp(p->params.native_display_type, name) != 0)
-        return NULL;
-
-    return p->params.native_display;
-}
-
 void ra_gl_ctx_uninit(struct ra_ctx *ctx)
 {
     if (ctx->swapchain) {
@@ -173,8 +162,6 @@ bool ra_gl_ctx_init(struct ra_ctx *ctx, GL *gl, struct ra_gl_ctx_params params)
     if (ext) {
         if (ext->color_depth)
             p->fns.color_depth = ext->color_depth;
-        if (ext->screenshot)
-            p->fns.screenshot = ext->screenshot;
         if (ext->start_frame)
             p->fns.start_frame = ext->start_frame;
         if (ext->submit_frame)
@@ -193,8 +180,6 @@ bool ra_gl_ctx_init(struct ra_ctx *ctx, GL *gl, struct ra_gl_ctx_params params)
     }
 
     gl->debug_context = ctx->opts.debug;
-    gl->get_native_display_ctx = p;
-    gl->get_native_display = get_native_display;
 
     if (gl->SwapInterval) {
         gl->SwapInterval(p->opts->swapinterval);
@@ -247,24 +232,6 @@ int ra_gl_ctx_color_depth(struct ra_swapchain *sw)
     return depth_g;
 }
 
-struct mp_image *ra_gl_ctx_screenshot(struct ra_swapchain *sw)
-{
-    struct priv *p = sw->priv;
-
-    assert(p->wrapped_fb);
-    struct mp_image *screen = gl_read_fbo_contents(p->gl, p->main_fb,
-                                                   p->wrapped_fb->params.w,
-                                                   p->wrapped_fb->params.h);
-
-    // OpenGL FB is also read in flipped order, so we need to flip when the
-    // rendering is *not* flipped, which in our case is whenever
-    // p->params.flipped is true. I hope that made sense
-    if (screen && p->params.flipped)
-        mp_image_vflip(screen);
-
-    return screen;
-}
-
 bool ra_gl_ctx_start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo)
 {
     struct priv *p = sw->priv;
@@ -348,7 +315,6 @@ void ra_gl_ctx_swap_buffers(struct ra_swapchain *sw)
 
 static const struct ra_swapchain_fns ra_gl_swapchain_fns = {
     .color_depth   = ra_gl_ctx_color_depth,
-    .screenshot    = ra_gl_ctx_screenshot,
     .start_frame   = ra_gl_ctx_start_frame,
     .submit_frame  = ra_gl_ctx_submit_frame,
     .swap_buffers  = ra_gl_ctx_swap_buffers,
diff --git a/video/out/opengl/context.h b/video/out/opengl/context.h
index 95ed374..5fccc70 100644
--- a/video/out/opengl/context.h
+++ b/video/out/opengl/context.h
@@ -34,10 +34,6 @@ struct ra_gl_ctx_params {
     // ra_swapchain_fns structs will entirely replace the equivalent ra_gl_ctx
     // functions in the resulting ra_swapchain.
     const struct ra_swapchain_fns *external_swapchain;
-
-    // For hwdec_vaegl.c:
-    const char *native_display_type;
-    void *native_display;
 };
 
 void ra_gl_ctx_uninit(struct ra_ctx *ctx);
diff --git a/video/out/opengl/context_android.c b/video/out/opengl/context_android.c
index a2acce2..d405e79 100644
--- a/video/out/opengl/context_android.c
+++ b/video/out/opengl/context_android.c
@@ -26,6 +26,19 @@
 #include "options/m_config.h"
 #include "context.h"
 
+struct android_opts {
+    struct m_geometry surface_size; // a 0 width/height is queried from the EGL surface in android_reconfig()
+};
+
+#define OPT_BASE_STRUCT struct android_opts
+const struct m_sub_options android_conf = {
+    .opts = (const struct m_option[]) {
+        OPT_SIZE_BOX("android-surface-size", surface_size, UPDATE_VO_RESIZE),
+        {0}
+    },
+    .size = sizeof(struct android_opts),
+};
+
 struct priv {
     struct GL gl;
     EGLDisplay egl_display;
@@ -123,10 +136,16 @@ fail:
 static bool android_reconfig(struct ra_ctx *ctx)
 {
     struct priv *p = ctx->priv;
-    int w, h;
+    void *tmp = talloc_new(NULL);
+    struct android_opts *opts = mp_get_config_group(tmp, ctx->global, &android_conf);
+    int w = opts->surface_size.w, h = opts->surface_size.h;
+
+    if (!w)
+        eglQuerySurface(p->egl_display, p->egl_surface, EGL_WIDTH, &w);
+    if (!h)
+        eglQuerySurface(p->egl_display, p->egl_surface, EGL_HEIGHT, &h);
 
-    if (!eglQuerySurface(p->egl_display, p->egl_surface, EGL_WIDTH, &w) ||
-        !eglQuerySurface(p->egl_display, p->egl_surface, EGL_HEIGHT, &h)) {
+    if (!w || !h) {
         MP_FATAL(ctx, "Failed to get height and width!\n");
         return false;
     }
@@ -134,6 +153,8 @@ static bool android_reconfig(struct ra_ctx *ctx)
     ctx->vo->dwidth = w;
     ctx->vo->dheight = h;
     ra_gl_ctx_resize(ctx->swapchain, w, h, 0);
+
+    talloc_free(tmp);
     return true;
 }
 
diff --git a/video/out/opengl/context_angle.c b/video/out/opengl/context_angle.c
index 986a503..6d45e29 100644
--- a/video/out/opengl/context_angle.c
+++ b/video/out/opengl/context_angle.c
@@ -525,17 +525,6 @@ static int angle_color_depth(struct ra_swapchain *sw)
     return 8;
 }
 
-static struct mp_image *angle_screenshot(struct ra_swapchain *sw)
-{
-    struct priv *p = sw->ctx->priv;
-    if (p->dxgi_swapchain) {
-        struct mp_image *img = mp_d3d11_screenshot(p->dxgi_swapchain);
-        if (img)
-            return img;
-    }
-    return ra_gl_ctx_screenshot(sw);
-}
-
 static bool angle_submit_frame(struct ra_swapchain *sw,
                                const struct vo_frame *frame)
 {
@@ -611,7 +600,6 @@ static bool angle_init(struct ra_ctx *ctx)
     // Custom swapchain impl for the D3D11 swapchain-based surface
     static const struct ra_swapchain_fns dxgi_swapchain_fns = {
         .color_depth = angle_color_depth,
-        .screenshot = angle_screenshot,
         .submit_frame = angle_submit_frame,
     };
     struct ra_gl_ctx_params params = {
diff --git a/video/out/opengl/context_cocoa.c b/video/out/opengl/context_cocoa.c
index 2256d31..b73ca9d 100644
--- a/video/out/opengl/context_cocoa.c
+++ b/video/out/opengl/context_cocoa.c
@@ -19,7 +19,6 @@
 #include <dlfcn.h>
 #include "options/m_config.h"
 #include "video/out/cocoa_common.h"
-#include "osdep/macosx_versions.h"
 #include "context.h"
 
 struct cocoa_opts {
@@ -37,6 +36,7 @@ const struct m_sub_options cocoa_conf = {
 
 struct priv {
     GL gl;
+    void (GLAPIENTRY *Flush)(void);
     CGLPixelFormatObj pix;
     CGLContextObj ctx;
 
@@ -50,6 +50,8 @@ static int set_swap_interval(int enabled)
     return (err == kCGLNoError) ? 0 : -1;
 }
 
+static void glFlushDummy(void) { } // no-op installed as gl->Flush; the real Flush is kept in priv and called after the buffer swap
+
 static void *cocoa_glgetaddr(const char *s)
 {
     void *ret = NULL;
@@ -139,6 +141,8 @@ static bool create_gl_context(struct ra_ctx *ctx)
 
     mpgl_load_functions(gl, (void *)cocoa_glgetaddr, NULL, ctx->vo->log);
     gl->SwapInterval = set_swap_interval;
+    p->Flush = gl->Flush;
+    gl->Flush = glFlushDummy;
 
     CGLReleasePixelFormat(p->pix);
 
@@ -156,9 +160,8 @@ static void cocoa_uninit(struct ra_ctx *ctx)
 static void cocoa_swap_buffers(struct ra_ctx *ctx)
 {
     struct priv *p = ctx->priv;
-    GL *gl = &p->gl;
     vo_cocoa_swap_buffers(ctx->vo);
-    gl->Flush();
+    p->Flush();
 }
 
 static bool cocoa_init(struct ra_ctx *ctx)
@@ -168,6 +171,8 @@ static bool cocoa_init(struct ra_ctx *ctx)
     p->opts = mp_get_config_group(ctx, ctx->global, &cocoa_conf);
     vo_cocoa_init(ctx->vo);
 
+    MP_WARN(ctx->vo, "opengl cocoa backend is deprecated, use vo=libmpv instead\n");
+
     if (!create_gl_context(ctx))
         goto fail;
 
diff --git a/video/out/opengl/context_drm_egl.c b/video/out/opengl/context_drm_egl.c
index 6191309..72eb2e3 100644
--- a/video/out/opengl/context_drm_egl.c
+++ b/video/out/opengl/context_drm_egl.c
@@ -25,13 +25,13 @@
 #include <unistd.h>
 
 #include <gbm.h>
-#include <drm_fourcc.h>
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
-#include "libmpv/opengl_cb.h"
+#include "libmpv/render_gl.h"
 #include "video/out/drm_common.h"
 #include "common/common.h"
+#include "osdep/timer.h"
 
 #include "egl_helpers.h"
 #include "common.h"
@@ -72,7 +72,7 @@ struct priv {
     struct gbm gbm;
     struct framebuffer *fb;
 
-    uint32_t primary_plane_format;
+    uint32_t gbm_format;
 
     bool active;
     bool waiting_for_flip;
@@ -80,9 +80,80 @@ struct priv {
     bool vt_switcher_active;
     struct vt_switcher vt_switcher;
 
-    struct mpv_opengl_cb_drm_params drm_params;
+    struct mpv_opengl_drm_params drm_params;
+    struct mpv_opengl_drm_osd_size osd_size;
 };
 
+// Maps a GBM pixel format code to its macro name, for use in log messages.
+// Deliberately incomplete: only the four formats this module can select are
+// known; any other value yields "UNKNOWN".
+static const char *gbm_format_to_string(uint32_t format)
+{
+    if (format == GBM_FORMAT_XRGB8888)
+        return "GBM_FORMAT_XRGB8888";
+    if (format == GBM_FORMAT_ARGB8888)
+        return "GBM_FORMAT_ARGB8888";
+    if (format == GBM_FORMAT_XRGB2101010)
+        return "GBM_FORMAT_XRGB2101010";
+    if (format == GBM_FORMAT_ARGB2101010)
+        return "GBM_FORMAT_ARGB2101010";
+
+    return "UNKNOWN";
+}
+
+// Returns the alpha/non-alpha sibling of a GBM format (XRGB <-> ARGB), or 0
+// if the format has no known sibling. Used as a second choice when matching
+// EGLConfigs: an XRGB framebuffer may use an ARGB config, and vice versa. The
+// ARGB -> XRGB direction seems necessary for broken Mali drivers that don't
+// report their EGLConfigs as supporting alpha properly.
+static uint32_t fallback_format_for(uint32_t format)
+{
+    uint32_t sibling = 0;
+
+    if (format == GBM_FORMAT_XRGB8888)
+        sibling = GBM_FORMAT_ARGB8888;
+    else if (format == GBM_FORMAT_ARGB8888)
+        sibling = GBM_FORMAT_XRGB8888;
+    else if (format == GBM_FORMAT_XRGB2101010)
+        sibling = GBM_FORMAT_ARGB2101010;
+    else if (format == GBM_FORMAT_ARGB2101010)
+        sibling = GBM_FORMAT_XRGB2101010;
+    return sibling;
+}
+
+// EGLConfig chooser handed to mpegl_create_context_cb(): returns the index of the config whose
+// native visual ID is the GBM format or, failing that, its XRGB/ARGB sibling; -1 if none matches.
+static int match_config_to_visual(void *user_data, EGLConfig *configs, int num_configs)
+{
+    struct ra_ctx *ctx = (struct ra_ctx*)user_data;
+    struct priv *p = ctx->priv;
+    const EGLint visual_id[] = {
+        (EGLint)p->gbm_format,
+        (EGLint)fallback_format_for(p->gbm_format),
+        0 // terminator; a 0 fallback (no sibling format) ends the list early
+    };
+
+    for (unsigned int i = 0; visual_id[i] != 0; ++i) {
+        MP_VERBOSE(ctx, "Attempting to find EGLConfig matching %s\n",
+                   gbm_format_to_string(visual_id[i]));
+        for (int j = 0; j < num_configs; ++j) { // signed, like num_configs and the return value
+            EGLint id;
+            if (!eglGetConfigAttrib(p->egl.display, configs[j], EGL_NATIVE_VISUAL_ID, &id))
+                continue;
+            if (visual_id[i] == id) {
+                MP_VERBOSE(ctx, "Found matching EGLConfig for %s\n",
+                           gbm_format_to_string(visual_id[i]));
+                return j;
+            }
+        }
+        MP_VERBOSE(ctx, "No matching EGLConfig for %s\n", gbm_format_to_string(visual_id[i]));
+    }
+
+    MP_ERR(ctx, "Could not find EGLConfig matching the GBM visual (%s).\n",
+           gbm_format_to_string(p->gbm_format));
+    return -1;
+}
+
 static bool init_egl(struct ra_ctx *ctx)
 {
     struct priv *p = ctx->priv;
@@ -97,7 +168,11 @@ static bool init_egl(struct ra_ctx *ctx)
         return false;
     }
     EGLConfig config;
-    if (!mpegl_create_context(ctx, p->egl.display, &p->egl.context, &config))
+    if (!mpegl_create_context_cb(ctx,
+                                 p->egl.display,
+                                 (struct mpegl_cb){match_config_to_visual, ctx},
+                                 &p->egl.context,
+                                 &config))
         return false;
     MP_VERBOSE(ctx, "Initializing EGL surface\n");
     p->egl.surface
@@ -120,12 +195,12 @@ static bool init_gbm(struct ra_ctx *ctx)
     }
 
     MP_VERBOSE(ctx->vo, "Initializing GBM surface (%d x %d)\n",
-        p->kms->mode.hdisplay, p->kms->mode.vdisplay);
+        p->osd_size.width, p->osd_size.height);
     p->gbm.surface = gbm_surface_create(
         p->gbm.device,
-        p->kms->mode.hdisplay,
-        p->kms->mode.vdisplay,
-        p->primary_plane_format, // drm_fourcc.h defs should be gbm-compatible
+        p->osd_size.width,
+        p->osd_size.height,
+        p->gbm_format,
         GBM_BO_USE_SCANOUT | GBM_BO_USE_RENDERING);
     if (!p->gbm.surface) {
         MP_ERR(ctx->vo, "Failed to create GBM surface.\n");
@@ -159,7 +234,7 @@ static void update_framebuffer_from_bo(struct ra_ctx *ctx, struct gbm_bo *bo)
     uint32_t handle = gbm_bo_get_handle(bo).u32;
 
     int ret = drmModeAddFB2(fb->fd, fb->width, fb->height,
-                            p->primary_plane_format,
+                            p->gbm_format,
                             (uint32_t[4]){handle, 0, 0, 0},
                             (uint32_t[4]){stride, 0, 0, 0},
                             (uint32_t[4]){0, 0, 0, 0},
@@ -172,17 +247,104 @@ static void update_framebuffer_from_bo(struct ra_ctx *ctx, struct gbm_bo *bo)
     p->fb = fb;
 }
 
+// Initial atomic modeset: binds the connector to our CRTC, sets the mode, and
+// puts the current framebuffer on the OSD plane (osd_size scaled to the mode).
+static bool crtc_setup_atomic(struct ra_ctx *ctx)
+{
+    struct priv *p = ctx->priv;
+    struct drm_atomic_context *atomic_ctx = p->kms->atomic_context;
+
+    if (!drm_atomic_save_old_state(atomic_ctx))
+        MP_WARN(ctx->vo, "Failed to save old DRM atomic state\n");
+
+    drmModeAtomicReqPtr request = drmModeAtomicAlloc();
+    if (!request) {
+        MP_ERR(ctx->vo, "Failed to allocate drm atomic request\n");
+        return false;
+    }
+
+    if (drm_object_set_property(request, atomic_ctx->connector, "CRTC_ID", p->kms->crtc_id) < 0) {
+        MP_ERR(ctx->vo, "Could not set CRTC_ID on connector\n");
+        goto err; // the request is already allocated here and must be freed
+    }
+
+    if (!drm_mode_ensure_blob(p->kms->fd, &p->kms->mode)) {
+        MP_ERR(ctx->vo, "Failed to create DRM mode blob\n");
+        goto err;
+    }
+    if (drm_object_set_property(request, atomic_ctx->crtc, "MODE_ID", p->kms->mode.blob_id) < 0) {
+        MP_ERR(ctx->vo, "Could not set MODE_ID on crtc\n");
+        goto err;
+    }
+    if (drm_object_set_property(request, atomic_ctx->crtc, "ACTIVE", 1) < 0) {
+        MP_ERR(ctx->vo, "Could not set ACTIVE on crtc\n");
+        goto err;
+    }
+
+    drm_object_set_property(request, atomic_ctx->osd_plane, "FB_ID", p->fb->id);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "CRTC_ID", p->kms->crtc_id);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "SRC_X",   0);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "SRC_Y",   0);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "SRC_W",   p->osd_size.width << 16);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "SRC_H",   p->osd_size.height << 16);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "CRTC_X",  0);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "CRTC_Y",  0);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "CRTC_W",  p->kms->mode.mode.hdisplay);
+    drm_object_set_property(request, atomic_ctx->osd_plane, "CRTC_H",  p->kms->mode.mode.vdisplay);
+
+    int ret = drmModeAtomicCommit(p->kms->fd, request, DRM_MODE_ATOMIC_ALLOW_MODESET, NULL);
+    if (ret)
+        MP_ERR(ctx->vo, "Failed to commit ModeSetting atomic request (%d)\n", ret);
+    drmModeAtomicFree(request);
+    return ret == 0;
+
+  err:
+    drmModeAtomicFree(request);
+    return false;
+}
+
+// Puts the DRM state saved by crtc_setup_atomic() into a fresh atomic request and commits it.
+// A failing restore step is only logged; the return value reflects the commit alone.
+static bool crtc_release_atomic(struct ra_ctx *ctx)
+{
+    struct priv *p = ctx->priv;
+    struct drm_atomic_context *atomic_ctx = p->kms->atomic_context;
+
+    drmModeAtomicReqPtr req = drmModeAtomicAlloc();
+    if (!req) {
+        MP_ERR(ctx->vo, "Failed to allocate drm atomic request\n");
+        return false;
+    }
+
+    if (!drm_atomic_restore_old_state(req, atomic_ctx))
+        MP_WARN(ctx->vo, "Got error while restoring old state\n");
+
+    const int err = drmModeAtomicCommit(p->kms->fd, req, DRM_MODE_ATOMIC_ALLOW_MODESET, NULL);
+    if (err)
+        MP_WARN(ctx->vo, "Failed to commit ModeSetting atomic request (%d)\n", err);
+
+    drmModeAtomicFree(req);
+    return !err;
+}
+
 static bool crtc_setup(struct ra_ctx *ctx)
 {
     struct priv *p = ctx->priv;
     if (p->active)
         return true;
-    p->old_crtc = drmModeGetCrtc(p->kms->fd, p->kms->crtc_id);
-    int ret = drmModeSetCrtc(p->kms->fd, p->kms->crtc_id, p->fb->id,
-                             0, 0, &p->kms->connector->connector_id, 1,
-                             &p->kms->mode);
-    p->active = true;
-    return ret == 0;
+
+    if (p->kms->atomic_context) {
+        int ret = crtc_setup_atomic(ctx);
+        p->active = true;
+        return ret;
+    } else {
+        p->old_crtc = drmModeGetCrtc(p->kms->fd, p->kms->crtc_id);
+        int ret = drmModeSetCrtc(p->kms->fd, p->kms->crtc_id, p->fb->id,
+                                 0, 0, &p->kms->connector->connector_id, 1,
+                                 &p->kms->mode.mode);
+        p->active = true;
+        return ret == 0;
+    }
 }
 
 static void crtc_release(struct ra_ctx *ctx)
@@ -202,21 +364,28 @@ static void crtc_release(struct ra_ctx *ctx)
         }
     }
 
-    if (p->old_crtc) {
-        drmModeSetCrtc(p->kms->fd,
-                       p->old_crtc->crtc_id, p->old_crtc->buffer_id,
-                       p->old_crtc->x, p->old_crtc->y,
-                       &p->kms->connector->connector_id, 1,
-                       &p->old_crtc->mode);
-        drmModeFreeCrtc(p->old_crtc);
-        p->old_crtc = NULL;
+    if (p->kms->atomic_context) {
+        if (p->kms->atomic_context->old_state.saved) {
+            if (!crtc_release_atomic(ctx))
+                MP_ERR(ctx->vo, "Failed to restore previous mode\n");
+        }
+    } else {
+        if (p->old_crtc) {
+            drmModeSetCrtc(p->kms->fd,
+                           p->old_crtc->crtc_id, p->old_crtc->buffer_id,
+                           p->old_crtc->x, p->old_crtc->y,
+                           &p->kms->connector->connector_id, 1,
+                           &p->old_crtc->mode);
+            drmModeFreeCrtc(p->old_crtc);
+            p->old_crtc = NULL;
+        }
     }
 }
 
 static void release_vt(void *data)
 {
     struct ra_ctx *ctx = data;
-    MP_VERBOSE(ctx->vo, "Releasing VT");
+    MP_VERBOSE(ctx->vo, "Releasing VT\n");
     crtc_release(ctx);
     if (USE_MASTER) {
         //this function enables support for switching to x, weston etc.
@@ -233,7 +402,7 @@ static void release_vt(void *data)
 static void acquire_vt(void *data)
 {
     struct ra_ctx *ctx = data;
-    MP_VERBOSE(ctx->vo, "Acquiring VT");
+    MP_VERBOSE(ctx->vo, "Acquiring VT\n");
     if (USE_MASTER) {
         struct priv *p = ctx->priv;
         if (drmSetMaster(p->kms->fd)) {
@@ -249,8 +418,10 @@ static bool drm_atomic_egl_start_frame(struct ra_swapchain *sw, struct ra_fbo *o
 {
     struct priv *p = sw->ctx->priv;
     if (p->kms->atomic_context) {
-        p->kms->atomic_context->request = drmModeAtomicAlloc();
-        p->drm_params.atomic_request = p->kms->atomic_context->request;
+        if (!p->kms->atomic_context->request) {
+            p->kms->atomic_context->request = drmModeAtomicAlloc();
+            p->drm_params.atomic_request_ptr = &p->kms->atomic_context->request;
+        }
         return ra_gl_ctx_start_frame(sw, out_fbo);
     }
     return false;
@@ -266,15 +437,18 @@ static void drm_egl_swap_buffers(struct ra_ctx *ctx)
     struct drm_atomic_context *atomic_ctx = p->kms->atomic_context;
     int ret;
 
+    if (!p->active)
+        return;
+
     eglSwapBuffers(p->egl.display, p->egl.surface);
     p->gbm.next_bo = gbm_surface_lock_front_buffer(p->gbm.surface);
     p->waiting_for_flip = true;
     update_framebuffer_from_bo(ctx, p->gbm.next_bo);
 
     if (atomic_ctx) {
-        drm_object_set_property(atomic_ctx->request, atomic_ctx->primary_plane, "FB_ID", p->fb->id);
-        drm_object_set_property(atomic_ctx->request, atomic_ctx->primary_plane, "CRTC_ID", atomic_ctx->crtc->id);
-        drm_object_set_property(atomic_ctx->request, atomic_ctx->primary_plane, "ZPOS", 1);
+        drm_object_set_property(atomic_ctx->request, atomic_ctx->osd_plane, "FB_ID", p->fb->id);
+        drm_object_set_property(atomic_ctx->request, atomic_ctx->osd_plane, "CRTC_ID", atomic_ctx->crtc->id);
+        drm_object_set_property(atomic_ctx->request, atomic_ctx->osd_plane, "ZPOS", 1);
 
         ret = drmModeAtomicCommit(p->kms->fd, atomic_ctx->request,
                                   DRM_MODE_ATOMIC_NONBLOCK | DRM_MODE_PAGE_FLIP_EVENT, NULL);
@@ -304,7 +478,7 @@ static void drm_egl_swap_buffers(struct ra_ctx *ctx)
 
     if (atomic_ctx) {
         drmModeAtomicFree(atomic_ctx->request);
-        p->drm_params.atomic_request = atomic_ctx->request = NULL;
+        atomic_ctx->request = drmModeAtomicAlloc();
     }
 
     gbm_surface_release_buffer(p->gbm.surface, p->gbm.bo);
@@ -314,6 +488,15 @@ static void drm_egl_swap_buffers(struct ra_ctx *ctx)
 static void drm_egl_uninit(struct ra_ctx *ctx)
 {
     struct priv *p = ctx->priv;
+    struct drm_atomic_context *atomic_ctx = p->kms->atomic_context;
+
+    if (atomic_ctx) {
+        int ret = drmModeAtomicCommit(p->kms->fd, atomic_ctx->request, 0, NULL);
+        if (ret)
+            MP_ERR(ctx->vo, "Failed to commit atomic request (%d)\n", ret);
+        drmModeAtomicFree(atomic_ctx->request);
+    }
+
     ra_gl_ctx_uninit(ctx);
 
     crtc_release(ctx);
@@ -330,48 +513,51 @@ static void drm_egl_uninit(struct ra_ctx *ctx)
     p->egl.context = EGL_NO_CONTEXT;
     eglDestroyContext(p->egl.display, p->egl.context);
 
+    close(p->drm_params.render_fd);
+
     if (p->kms) {
         kms_destroy(p->kms);
         p->kms = 0;
     }
 }
 
-// If primary plane supports ARGB8888 we want to use that, but if it doesn't we
-// fall back on XRGB8888. If the driver does not support atomic there is no
-// particular reason to be using ARGB8888, so we fall back to XRGB8888 (another
-// reason is that we do not have the convenient atomic_ctx and its convenient
-// primary_plane field).
-static bool probe_primary_plane_format(struct ra_ctx *ctx)
+// If the OSD plane supports ARGB we want to use that, but if it doesn't we fall
+// back on XRGB. If the driver does not support atomic there is no particular
+// reason to be using ARGB (drmprime hwdec will not work without atomic,
+// anyway), so we fall back to XRGB (another reason is that we do not have the
+// convenient atomic_ctx and its convenient plane fields).
+static bool probe_gbm_format(struct ra_ctx *ctx, uint32_t argb_format, uint32_t xrgb_format)
 {
     struct priv *p = ctx->priv;
+
     if (!p->kms->atomic_context) {
-        p->primary_plane_format = DRM_FORMAT_XRGB8888;
-        MP_VERBOSE(ctx->vo, "Not using DRM Atomic: Use DRM_FORMAT_XRGB8888 for primary plane.\n");
+        p->gbm_format = xrgb_format;
+        MP_VERBOSE(ctx->vo, "Not using DRM Atomic: Use %s for OSD plane.\n",
+                   gbm_format_to_string(xrgb_format));
         return true;
     }
 
     drmModePlane *drmplane =
-        drmModeGetPlane(p->kms->fd, p->kms->atomic_context->primary_plane->id);
-    bool have_argb8888 = false;
-    bool have_xrgb8888 = false;
+        drmModeGetPlane(p->kms->fd, p->kms->atomic_context->osd_plane->id);
+    bool have_argb = false;
+    bool have_xrgb = false;
     bool result = false;
     for (unsigned int i = 0; i < drmplane->count_formats; ++i) {
-        if (drmplane->formats[i] == DRM_FORMAT_ARGB8888) {
-            have_argb8888 = true;
-        } else if (drmplane->formats[i] == DRM_FORMAT_XRGB8888) {
-            have_xrgb8888 = true;
+        if (drmplane->formats[i] == argb_format) {
+            have_argb = true;
+        } else if (drmplane->formats[i] == xrgb_format) {
+            have_xrgb = true;
         }
     }
 
-    if (have_argb8888) {
-        p->primary_plane_format = DRM_FORMAT_ARGB8888;
-        MP_VERBOSE(ctx->vo, "DRM_FORMAT_ARGB8888 supported by primary plane.\n");
+    if (have_argb) {
+        p->gbm_format = argb_format;
+        MP_VERBOSE(ctx->vo, "%s supported by OSD plane.\n", gbm_format_to_string(argb_format));
         result = true;
-    } else if (have_xrgb8888) {
-        p->primary_plane_format = DRM_FORMAT_XRGB8888;
-        MP_VERBOSE(ctx->vo,
-                   "DRM_FORMAT_ARGB8888 not supported by primary plane: "
-                   "Falling back to DRM_FORMAT_XRGB8888.\n");
+    } else if (have_xrgb) {
+        p->gbm_format = xrgb_format;
+        MP_VERBOSE(ctx->vo, "%s not supported by OSD plane: Falling back to %s.\n",
+                   gbm_format_to_string(argb_format), gbm_format_to_string(xrgb_format));
         result = true;
     }
 
@@ -400,14 +586,40 @@ static bool drm_egl_init(struct ra_ctx *ctx)
     MP_VERBOSE(ctx, "Initializing KMS\n");
     p->kms = kms_create(ctx->log, ctx->vo->opts->drm_opts->drm_connector_spec,
                         ctx->vo->opts->drm_opts->drm_mode_id,
-                        ctx->vo->opts->drm_opts->drm_overlay_id);
+                        ctx->vo->opts->drm_opts->drm_osd_plane_id,
+                        ctx->vo->opts->drm_opts->drm_video_plane_id);
     if (!p->kms) {
         MP_ERR(ctx, "Failed to create KMS.\n");
         return false;
     }
 
-    if (!probe_primary_plane_format(ctx)) {
-        MP_ERR(ctx->vo, "No suitable format found on DRM primary plane.\n");
+    if (ctx->vo->opts->drm_opts->drm_osd_size.wh_valid) {
+        if (p->kms->atomic_context) {
+            p->osd_size.width = ctx->vo->opts->drm_opts->drm_osd_size.w;
+            p->osd_size.height = ctx->vo->opts->drm_opts->drm_osd_size.h;
+        } else {
+            p->osd_size.width = p->kms->mode.mode.hdisplay;
+            p->osd_size.height = p->kms->mode.mode.vdisplay;
+            MP_WARN(ctx, "Setting OSD size is only available with DRM atomic, defaulting to screen resolution\n");
+        }
+    } else {
+        p->osd_size.width = p->kms->mode.mode.hdisplay;
+        p->osd_size.height = p->kms->mode.mode.vdisplay;
+    }
+
+    uint32_t argb_format;
+    uint32_t xrgb_format;
+    if (DRM_OPTS_FORMAT_XRGB2101010 == ctx->vo->opts->drm_opts->drm_format) {
+        argb_format = GBM_FORMAT_ARGB2101010;
+        xrgb_format = GBM_FORMAT_XRGB2101010;
+    } else {
+        argb_format = GBM_FORMAT_ARGB8888;
+        xrgb_format = GBM_FORMAT_XRGB8888;
+    }
+
+    if (!probe_gbm_format(ctx, argb_format, xrgb_format)) {
+        MP_ERR(ctx->vo, "No suitable format found on DRM primary plane (tried: %s and %s).\n",
+               gbm_format_to_string(argb_format), gbm_format_to_string(xrgb_format));
         return false;
     }
 
@@ -451,18 +663,34 @@ static bool drm_egl_init(struct ra_ctx *ctx)
 
     p->drm_params.fd = p->kms->fd;
     p->drm_params.crtc_id = p->kms->crtc_id;
+    p->drm_params.connector_id = p->kms->connector->connector_id;
     if (p->kms->atomic_context)
-        p->drm_params.atomic_request = p->kms->atomic_context->request;
+        p->drm_params.atomic_request_ptr = &p->kms->atomic_context->request;
+    char *rendernode_path = drmGetRenderDeviceNameFromFd(p->kms->fd);
+    if (rendernode_path) {
+        MP_VERBOSE(ctx, "Opening render node \"%s\"\n", rendernode_path);
+        p->drm_params.render_fd = open(rendernode_path, O_RDWR | O_CLOEXEC);
+        if (p->drm_params.render_fd < 0) {
+            MP_WARN(ctx, "Cannot open render node \"%s\": %s. VAAPI hwdec will be disabled\n",
+                    rendernode_path, mp_strerror(errno));
+        }
+        free(rendernode_path);
+    } else {
+        p->drm_params.render_fd = -1;
+        MP_VERBOSE(ctx, "Could not find path to render node. VAAPI hwdec will be disabled\n");
+    }
+
     struct ra_gl_ctx_params params = {
         .swap_buffers = drm_egl_swap_buffers,
-        .native_display_type = "opengl-cb-drm-params",
-        .native_display = &p->drm_params,
         .external_swapchain = p->kms->atomic_context ? &drm_atomic_swapchain :
                                                        NULL,
     };
     if (!ra_gl_ctx_init(ctx, &p->gl, params))
         return false;
 
+    ra_add_native_resource(ctx->ra, "drm_params", &p->drm_params);
+    ra_add_native_resource(ctx->ra, "drm_osd_size", &p->osd_size);
+
     return true;
 }
 
@@ -491,6 +719,25 @@ static int drm_egl_control(struct ra_ctx *ctx, int *events, int request,
     return VO_NOTIMPL;
 }
 
+static void wait_events(struct ra_ctx *ctx, int64_t until_time_us)
+{
+    struct priv *priv = ctx->priv;
+    if (!priv->vt_switcher_active) { // no VT switcher to poll: use the default VO wait
+        vo_wait_default(ctx->vo, until_time_us);
+        return;
+    }
+    int64_t remaining_us = until_time_us - mp_time_us();
+    int poll_ms = MPCLAMP((remaining_us + 500) / 1000, 0, 10000); // nearest ms, within [0, 10000]
+    vt_switcher_poll(&priv->vt_switcher, poll_ms);
+}
+
+static void wakeup(struct ra_ctx *ctx)
+{
+    struct priv *priv = ctx->priv;
+    if (priv->vt_switcher_active) // wait_events() only polls the switcher when it is active
+        vt_switcher_interrupt_poll(&priv->vt_switcher);
+}
+
 const struct ra_ctx_fns ra_ctx_drm_egl = {
     .type           = "opengl",
     .name           = "drm",
@@ -498,4 +745,6 @@ const struct ra_ctx_fns ra_ctx_drm_egl = {
     .control        = drm_egl_control,
     .init           = drm_egl_init,
     .uninit         = drm_egl_uninit,
+    .wait_events    = wait_events,
+    .wakeup         = wakeup,
 };
diff --git a/video/out/opengl/context_dxinterop.c b/video/out/opengl/context_dxinterop.c
index 85d84bf..2e65a89 100644
--- a/video/out/opengl/context_dxinterop.c
+++ b/video/out/opengl/context_dxinterop.c
@@ -481,20 +481,6 @@ static int GLAPIENTRY dxgl_swap_interval(int interval)
     return 1;
 }
 
-static void * GLAPIENTRY dxgl_get_native_display(const char *name)
-{
-    if (!current_ctx || !name)
-        return NULL;
-    struct priv *p = current_ctx->priv;
-
-    if (p->device && strcmp("IDirect3DDevice9Ex", name) == 0) {
-        return p->device;
-    } else if (p->device_h && strcmp("dxinterop_device_HANDLE", name) == 0) {
-        return p->device_h;
-    }
-    return NULL;
-}
-
 static void dxgl_swap_buffers(struct ra_ctx *ctx)
 {
     struct priv *p = ctx->priv;
@@ -560,7 +546,6 @@ static bool dxgl_init(struct ra_ctx *ctx)
 
     current_ctx = ctx;
     gl->SwapInterval = dxgl_swap_interval;
-    gl->MPGetNativeDisplay = dxgl_get_native_display;
 
     if (d3d_create(ctx) < 0)
         goto fail;
@@ -577,6 +562,9 @@ static bool dxgl_init(struct ra_ctx *ctx)
     if (!ra_gl_ctx_init(ctx, gl, params))
         goto fail;
 
+    ra_add_native_resource(ctx->ra, "IDirect3DDevice9Ex", p->device);
+    ra_add_native_resource(ctx->ra, "dxinterop_device_HANDLE", p->device_h);
+
     DwmEnableMMCSS(TRUE);
     return true;
 fail:
diff --git a/video/out/opengl/context_rpi.c b/video/out/opengl/context_rpi.c
index 8b447d0..fbd9721 100644
--- a/video/out/opengl/context_rpi.c
+++ b/video/out/opengl/context_rpi.c
@@ -198,7 +198,8 @@ static bool recreate_dispmanx(struct ra_ctx *ctx)
 
     ctx->vo->dwidth = p->w;
     ctx->vo->dheight = p->h;
-    ra_gl_ctx_resize(ctx->swapchain, p->w, p->h, 0);
+    if (ctx->swapchain)
+        ra_gl_ctx_resize(ctx->swapchain, p->w, p->h, 0);
 
     ctx->vo->want_redraw = true;
 
@@ -240,13 +241,14 @@ static bool rpi_init(struct ra_ctx *ctx)
 
     struct ra_gl_ctx_params params = {
         .swap_buffers = rpi_swap_buffers,
-        .native_display_type = "MPV_RPI_WINDOW",
-        .native_display = p->win_params,
     };
 
     if (!ra_gl_ctx_init(ctx, &p->gl, params))
         goto fail;
 
+    ra_add_native_resource(ctx->ra, "MPV_RPI_WINDOW", p->win_params);
+
+    ra_gl_ctx_resize(ctx->swapchain, ctx->vo->dwidth, ctx->vo->dheight, 0);
     return true;
 
 fail:
diff --git a/video/out/opengl/context_wayland.c b/video/out/opengl/context_wayland.c
index f686fcc..650072c 100644
--- a/video/out/opengl/context_wayland.c
+++ b/video/out/opengl/context_wayland.c
@@ -78,13 +78,13 @@ static bool egl_create_context(struct ra_ctx *ctx)
 
     struct ra_gl_ctx_params params = {
         .swap_buffers = wayland_egl_swap_buffers,
-        .native_display_type = "wl",
-        .native_display = wl->display,
     };
 
     if (!ra_gl_ctx_init(ctx, &p->gl, params))
         return false;
 
+    ra_add_native_resource(ctx->ra, "wl", wl->display);
+
     return true;
 }
 
diff --git a/video/out/opengl/context_x11egl.c b/video/out/opengl/context_x11egl.c
index 7ab4fe0..32530cc 100644
--- a/video/out/opengl/context_x11egl.c
+++ b/video/out/opengl/context_x11egl.c
@@ -142,13 +142,13 @@ static bool mpegl_init(struct ra_ctx *ctx)
 
     struct ra_gl_ctx_params params = {
         .swap_buffers = mpegl_swap_buffers,
-        .native_display_type = "x11",
-        .native_display = vo->x11->display,
     };
 
     if (!ra_gl_ctx_init(ctx, &p->gl, params))
         goto uninit;
 
+    ra_add_native_resource(ctx->ra, "x11", vo->x11->display);
+
     return true;
 
 uninit:
diff --git a/video/out/opengl/cuda_dynamic.c b/video/out/opengl/cuda_dynamic.c
deleted file mode 100644
index 1135a1f..0000000
--- a/video/out/opengl/cuda_dynamic.c
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "cuda_dynamic.h"
-
-#include <pthread.h>
-
-#if defined(_WIN32)
-# include <windows.h>
-# define dlopen(filename, flags) LoadLibrary(TEXT(filename))
-# define dlsym(handle, symbol) (void *)GetProcAddress(handle, symbol)
-# define dlclose(handle) FreeLibrary(handle)
-#else
-# include <dlfcn.h>
-#endif
-
-#if defined(_WIN32) || defined(__CYGWIN__)
-# define CUDA_LIBNAME "nvcuda.dll"
-#else
-# define CUDA_LIBNAME "libcuda.so.1"
-#endif
-
-#define CUDA_DECL(NAME, TYPE) \
-    TYPE *mpv_ ## NAME;
-CUDA_FNS(CUDA_DECL)
-
-static bool cuda_loaded = false;
-static pthread_once_t cuda_load_once = PTHREAD_ONCE_INIT;
-
-static void cuda_do_load(void)
-{
-    void *lib = dlopen(CUDA_LIBNAME, RTLD_LAZY);
-    if (!lib) {
-        return;
-    }
-
-#define CUDA_LOAD_SYMBOL(NAME, TYPE) \
-    mpv_ ## NAME = dlsym(lib, #NAME); if (!mpv_ ## NAME) return;
-
-    CUDA_FNS(CUDA_LOAD_SYMBOL)
-
-    cuda_loaded = true;
-}
-
-bool cuda_load(void)
-{
-    pthread_once(&cuda_load_once, cuda_do_load);
-    return cuda_loaded;
-}
diff --git a/video/out/opengl/cuda_dynamic.h b/video/out/opengl/cuda_dynamic.h
deleted file mode 100644
index 9d75b31..0000000
--- a/video/out/opengl/cuda_dynamic.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * This file is part of mpv.
- *
- * It is based on an equivalent file in ffmpeg that was
- * constructed from documentation, rather than from any
- * original cuda headers.
- *
- * mpv is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * mpv is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with mpv.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef MPV_CUDA_DYNAMIC_H
-#define MPV_CUDA_DYNAMIC_H
-
-#include <stdbool.h>
-#include <stddef.h>
-
-#include "gl_headers.h"
-
-#define CUDA_VERSION 7050
-
-#if defined(_WIN32) || defined(__CYGWIN__)
-#define CUDAAPI __stdcall
-#else
-#define CUDAAPI
-#endif
-
-#define CU_CTX_SCHED_BLOCKING_SYNC 4
-
-typedef int CUdevice;
-
-typedef struct CUarray_st *CUarray;
-typedef struct CUgraphicsResource_st *CUgraphicsResource;
-typedef struct CUstream_st *CUstream;
-
-typedef void* CUcontext;
-#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
-typedef unsigned long long CUdeviceptr;
-#else
-typedef unsigned int CUdeviceptr;
-#endif
-
-typedef enum cudaError_enum {
-    CUDA_SUCCESS = 0
-} CUresult;
-
-typedef enum CUmemorytype_enum {
-    CU_MEMORYTYPE_HOST = 1,
-    CU_MEMORYTYPE_DEVICE = 2,
-    CU_MEMORYTYPE_ARRAY = 3
-} CUmemorytype;
-
-typedef struct CUDA_MEMCPY2D_st {
-    size_t srcXInBytes;
-    size_t srcY;
-    CUmemorytype srcMemoryType;
-    const void *srcHost;
-    CUdeviceptr srcDevice;
-    CUarray srcArray;
-    size_t srcPitch;
-
-    size_t dstXInBytes;
-    size_t dstY;
-    CUmemorytype dstMemoryType;
-    void *dstHost;
-    CUdeviceptr dstDevice;
-    CUarray dstArray;
-    size_t dstPitch;
-
-    size_t WidthInBytes;
-    size_t Height;
-} CUDA_MEMCPY2D;
-
-typedef enum CUGLDeviceList_enum {
-    CU_GL_DEVICE_LIST_ALL = 1,
-    CU_GL_DEVICE_LIST_CURRENT_FRAME = 2,
-    CU_GL_DEVICE_LIST_NEXT_FRAME = 3,
-} CUGLDeviceList;
-
-#define CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD 2
-
-typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
-typedef CUresult CUDAAPI tcuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
-typedef CUresult CUDAAPI tcuCtxPushCurrent_v2(CUcontext *pctx);
-typedef CUresult CUDAAPI tcuCtxPopCurrent_v2(CUcontext *pctx);
-typedef CUresult CUDAAPI tcuCtxDestroy_v2(CUcontext ctx);
-typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *pdevice, int ordinal);
-typedef CUresult CUDAAPI tcuMemcpy2D_v2(const CUDA_MEMCPY2D *pcopy);
-typedef CUresult CUDAAPI tcuGetErrorName(CUresult error, const char** pstr);
-typedef CUresult CUDAAPI tcuGetErrorString(CUresult error, const char** pstr);
-typedef CUresult CUDAAPI tcuGLGetDevices_v2(unsigned int* pCudaDeviceCount, CUdevice* pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
-typedef CUresult CUDAAPI tcuGraphicsGLRegisterImage(CUgraphicsResource* pCudaResource, GLuint image, GLenum target, unsigned int Flags);
-typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
-typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource* resources, CUstream hStream);
-typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray* pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
-
-#define CUDA_FNS(FN) \
-    FN(cuInit, tcuInit) \
-    FN(cuCtxCreate_v2, tcuCtxCreate_v2) \
-    FN(cuCtxPushCurrent_v2, tcuCtxPushCurrent_v2) \
-    FN(cuCtxPopCurrent_v2, tcuCtxPopCurrent_v2) \
-    FN(cuCtxDestroy_v2, tcuCtxDestroy_v2) \
-    FN(cuDeviceGet, tcuDeviceGet) \
-    FN(cuMemcpy2D_v2, tcuMemcpy2D_v2) \
-    FN(cuGetErrorName, tcuGetErrorName) \
-    FN(cuGetErrorString, tcuGetErrorString) \
-    FN(cuGLGetDevices_v2, tcuGLGetDevices_v2) \
-    FN(cuGraphicsGLRegisterImage, tcuGraphicsGLRegisterImage) \
-    FN(cuGraphicsUnregisterResource, tcuGraphicsUnregisterResource) \
-    FN(cuGraphicsMapResources, tcuGraphicsMapResources) \
-    FN(cuGraphicsUnmapResources, tcuGraphicsUnmapResources) \
-    FN(cuGraphicsSubResourceGetMappedArray, tcuGraphicsSubResourceGetMappedArray) \
-
-#define CUDA_EXT_DECL(NAME, TYPE) \
-    extern TYPE *mpv_ ## NAME;
-
-CUDA_FNS(CUDA_EXT_DECL)
-
-#define cuInit mpv_cuInit
-#define cuCtxCreate mpv_cuCtxCreate_v2
-#define cuCtxPushCurrent mpv_cuCtxPushCurrent_v2
-#define cuCtxPopCurrent mpv_cuCtxPopCurrent_v2
-#define cuCtxDestroy mpv_cuCtxDestroy_v2
-#define cuDeviceGet mpv_cuDeviceGet
-#define cuMemcpy2D mpv_cuMemcpy2D_v2
-#define cuGetErrorName mpv_cuGetErrorName
-#define cuGetErrorString mpv_cuGetErrorString
-#define cuGLGetDevices mpv_cuGLGetDevices_v2
-#define cuGraphicsGLRegisterImage mpv_cuGraphicsGLRegisterImage
-#define cuGraphicsUnregisterResource mpv_cuGraphicsUnregisterResource
-#define cuGraphicsMapResources mpv_cuGraphicsMapResources
-#define cuGraphicsUnmapResources mpv_cuGraphicsUnmapResources
-#define cuGraphicsSubResourceGetMappedArray mpv_cuGraphicsSubResourceGetMappedArray
-
-bool cuda_load(void);
-
-#endif // MPV_CUDA_DYNAMIC_H
diff --git a/video/out/opengl/egl_helpers.c b/video/out/opengl/egl_helpers.c
index 0033bf1..2905761 100644
--- a/video/out/opengl/egl_helpers.c
+++ b/video/out/opengl/egl_helpers.c
@@ -44,6 +44,38 @@
 #define EGL_OPENGL_ES3_BIT                      0x00000040
 #endif
 
+struct mp_egl_config_attr {
+    int attrib;         // EGL attribute id queried with eglGetConfigAttrib()
+    const char *name;   // label printed next to the value by dump_egl_config()
+};
+
+#define MP_EGL_ATTRIB(id) {id, # id} // initializer: the id plus its spelling
+
+static const struct mp_egl_config_attr mp_egl_attribs[] = { // logged per config
+    MP_EGL_ATTRIB(EGL_CONFIG_ID),
+    MP_EGL_ATTRIB(EGL_RED_SIZE),
+    MP_EGL_ATTRIB(EGL_GREEN_SIZE),
+    MP_EGL_ATTRIB(EGL_BLUE_SIZE),
+    MP_EGL_ATTRIB(EGL_ALPHA_SIZE),
+    MP_EGL_ATTRIB(EGL_COLOR_BUFFER_TYPE),
+    MP_EGL_ATTRIB(EGL_CONFIG_CAVEAT),
+    MP_EGL_ATTRIB(EGL_CONFORMANT),
+};
+
+// Log each attribute listed in mp_egl_attribs for one EGLConfig at level msgl.
+static void dump_egl_config(struct mp_log *log, int msgl, EGLDisplay display,
+                            EGLConfig config)
+{
+    for (int i = 0; i < MP_ARRAY_SIZE(mp_egl_attribs); i++) {
+        const struct mp_egl_config_attr *attr = &mp_egl_attribs[i];
+        EGLint value = -1;
+        if (eglGetConfigAttrib(display, config, attr->attrib, &value))
+            mp_msg(log, msgl, "  %s=%d\n", attr->name, value);
+        else
+            mp_msg(log, msgl, "  %s=<error>\n", attr->name);
+    }
+}
+
 // es_version: 0 (core), 2 or 3
 static bool create_context(struct ra_ctx *ctx, EGLDisplay display,
                            int es_version, struct mpegl_cb cb,
@@ -83,9 +115,9 @@ static bool create_context(struct ra_ctx *ctx, EGLDisplay display,
 
     EGLint attributes[] = {
         EGL_SURFACE_TYPE, EGL_WINDOW_BIT,
-        EGL_RED_SIZE, 1,
-        EGL_GREEN_SIZE, 1,
-        EGL_BLUE_SIZE, 1,
+        EGL_RED_SIZE, 8,
+        EGL_GREEN_SIZE, 8,
+        EGL_BLUE_SIZE, 8,
         EGL_ALPHA_SIZE, ctx->opts.want_alpha ? 1 : 0,
         EGL_RENDERABLE_TYPE, rend,
         EGL_NONE
@@ -101,17 +133,28 @@ static bool create_context(struct ra_ctx *ctx, EGLDisplay display,
 
     if (!num_configs) {
         talloc_free(configs);
-        MP_MSG(ctx, msgl, "Could not choose EGLConfig!\n");
+        MP_MSG(ctx, msgl, "Could not choose EGLConfig for %s!\n", name);
         return false;
     }
 
+    for (int n = 0; n < num_configs; n++)
+        dump_egl_config(ctx->log, MSGL_TRACE, display, configs[n]);
+
     int chosen = 0;
     if (cb.refine_config)
         chosen = cb.refine_config(cb.user_data, configs, num_configs);
+    if (chosen < 0) {
+        talloc_free(configs);
+        MP_MSG(ctx, msgl, "Could not refine EGLConfig for %s!\n", name);
+        return false;
+    }
     EGLConfig config = configs[chosen];
 
     talloc_free(configs);
 
+    MP_DBG(ctx, "Chosen EGLConfig:\n");
+    dump_egl_config(ctx->log, MSGL_DEBUG, display, config);
+
     EGLContext *egl_ctx = NULL;
 
     if (es_version) {
@@ -152,7 +195,7 @@ static bool create_context(struct ra_ctx *ctx, EGLDisplay display,
     }
 
     if (!egl_ctx) {
-        MP_MSG(ctx, msgl, "Could not create EGL context!\n");
+        MP_MSG(ctx, msgl, "Could not create EGL context for %s!\n", name);
         return false;
     }
 
@@ -199,6 +242,14 @@ bool mpegl_create_context_cb(struct ra_ctx *ctx, EGLDisplay display,
     return false;
 }
 
+// EGL-backed gl->SwapInterval; mpegl_load_functions() installs it as fallback.
+static int GLAPIENTRY swap_interval(int interval)
+{
+    EGLDisplay dpy = eglGetCurrentDisplay();
+    // 1 without a current display or when eglSwapInterval() fails, else 0.
+    return dpy ? !eglSwapInterval(dpy, interval) : 1;
+}
+
 static void *mpegl_get_proc_address(void *ctx, const char *name)
 {
     void *p = eglGetProcAddress(name);
@@ -223,4 +274,6 @@ void mpegl_load_functions(struct GL *gl, struct mp_log *log)
         egl_exts = eglQueryString(display, EGL_EXTENSIONS);
 
     mpgl_load_functions2(gl, mpegl_get_proc_address, NULL, egl_exts, log);
+    if (!gl->SwapInterval)
+        gl->SwapInterval = swap_interval;
 }
diff --git a/video/out/opengl/egl_helpers.h b/video/out/opengl/egl_helpers.h
index eaaf9d7..df489da 100644
--- a/video/out/opengl/egl_helpers.h
+++ b/video/out/opengl/egl_helpers.h
@@ -15,7 +15,9 @@ bool mpegl_create_context(struct ra_ctx *ctx, EGLDisplay display,
 
 struct mpegl_cb {
     // if set, pick the desired config from the given list and return its index
-    // defaults to 0 (they are sorted by eglChooseConfig)
+    // defaults to 0 (they are sorted by eglChooseConfig). return a negative
+    // number to indicate an error condition or that no suitable configs could
+    // be found.
     int (*refine_config)(void *user_data, EGLConfig *configs, int num_configs);
     void *user_data;
 };
diff --git a/video/out/opengl/hwdec_cuda.c b/video/out/opengl/hwdec_cuda.c
index 1a7df20..f80c145 100644
--- a/video/out/opengl/hwdec_cuda.c
+++ b/video/out/opengl/hwdec_cuda.c
@@ -27,8 +27,7 @@
  * when decoding 10bit streams (there is some hardware dithering going on).
  */
 
-#include "cuda_dynamic.h"
-
+#include <ffnvcodec/dynlink_loader.h>
 #include <libavutil/hwcontext.h>
 #include <libavutil/hwcontext_cuda.h>
 
@@ -39,6 +38,7 @@
 
 struct priv_owner {
     struct mp_hwdec_ctx hwctx;
+    CudaFunctions *cu;
     CUcontext display_ctx;
     CUcontext decode_ctx;
 };
@@ -56,13 +56,15 @@ static int check_cu(struct ra_hwdec *hw, CUresult err, const char *func)
     const char *err_name;
     const char *err_string;
 
+    struct priv_owner *p = hw->priv;
+
     MP_TRACE(hw, "Calling %s\n", func);
 
     if (err == CUDA_SUCCESS)
         return 0;
 
-    cuGetErrorName(err, &err_name);
-    cuGetErrorString(err, &err_string);
+    p->cu->cuGetErrorName(err, &err_name);
+    p->cu->cuGetErrorString(err, &err_string);
 
     MP_ERR(hw, "%s failed", func);
     if (err_name && err_string)
@@ -82,6 +84,7 @@ static int cuda_init(struct ra_hwdec *hw)
     unsigned int device_count;
     int ret = 0;
     struct priv_owner *p = hw->priv;
+    CudaFunctions *cu;
 
     if (!ra_is_gl(hw->ra))
         return -1;
@@ -92,24 +95,25 @@ static int cuda_init(struct ra_hwdec *hw)
         return -1;
     }
 
-    bool loaded = cuda_load();
-    if (!loaded) {
+    ret = cuda_load_functions(&p->cu, NULL);
+    if (ret != 0) {
         MP_VERBOSE(hw, "Failed to load CUDA symbols\n");
         return -1;
     }
+    cu = p->cu;
 
-    ret = CHECK_CU(cuInit(0));
+    ret = CHECK_CU(cu->cuInit(0));
     if (ret < 0)
         goto error;
 
     // Allocate display context
-    ret = CHECK_CU(cuGLGetDevices(&device_count, &display_dev, 1,
-                                  CU_GL_DEVICE_LIST_ALL));
+    ret = CHECK_CU(cu->cuGLGetDevices(&device_count, &display_dev, 1,
+                                      CU_GL_DEVICE_LIST_ALL));
     if (ret < 0)
         goto error;
 
-    ret = CHECK_CU(cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
-                               display_dev));
+    ret = CHECK_CU(cu->cuCtxCreate(&p->display_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                                   display_dev));
     if (ret < 0)
         goto error;
 
@@ -121,7 +125,7 @@ static int cuda_init(struct ra_hwdec *hw)
 
     if (decode_dev_idx > -1) {
         CUdevice decode_dev;
-        ret = CHECK_CU(cuDeviceGet(&decode_dev, decode_dev_idx));
+        ret = CHECK_CU(cu->cuDeviceGet(&decode_dev, decode_dev_idx));
         if (ret < 0)
             goto error;
 
@@ -129,12 +133,12 @@ static int cuda_init(struct ra_hwdec *hw)
             MP_INFO(hw, "Using separate decoder and display devices\n");
 
             // Pop the display context. We won't use it again during init()
-            ret = CHECK_CU(cuCtxPopCurrent(&dummy));
+            ret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
             if (ret < 0)
                 goto error;
 
-            ret = CHECK_CU(cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
-                                       decode_dev));
+            ret = CHECK_CU(cu->cuCtxCreate(&p->decode_ctx, CU_CTX_SCHED_BLOCKING_SYNC,
+                                           decode_dev));
             if (ret < 0)
                 goto error;
         }
@@ -155,7 +159,7 @@ static int cuda_init(struct ra_hwdec *hw)
         goto error;
     }
 
-    ret = CHECK_CU(cuCtxPopCurrent(&dummy));
+    ret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
     if (ret < 0)
         goto error;
 
@@ -168,7 +172,7 @@ static int cuda_init(struct ra_hwdec *hw)
 
  error:
     av_buffer_unref(&hw_device_ctx);
-    CHECK_CU(cuCtxPopCurrent(&dummy));
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 
     return -1;
 }
@@ -176,15 +180,18 @@ static int cuda_init(struct ra_hwdec *hw)
 static void cuda_uninit(struct ra_hwdec *hw)
 {
     struct priv_owner *p = hw->priv;
+    CudaFunctions *cu = p->cu;
 
     hwdec_devices_remove(hw->devs, &p->hwctx);
     av_buffer_unref(&p->hwctx.av_device_ref);
 
     if (p->decode_ctx && p->decode_ctx != p->display_ctx)
-        CHECK_CU(cuCtxDestroy(p->decode_ctx));
+        CHECK_CU(cu->cuCtxDestroy(p->decode_ctx));
 
     if (p->display_ctx)
-        CHECK_CU(cuCtxDestroy(p->display_ctx));
+        CHECK_CU(cu->cuCtxDestroy(p->display_ctx));
+
+    cuda_free_functions(&p->cu);
 }
 
 #undef CHECK_CU
@@ -195,6 +202,7 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
     struct priv_owner *p_owner = mapper->owner->priv;
     struct priv *p = mapper->priv;
     CUcontext dummy;
+    CudaFunctions *cu = p_owner->cu;
     int ret = 0, eret = 0;
 
     p->display_ctx = p_owner->display_ctx;
@@ -212,7 +220,7 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
         return -1;
     }
 
-    ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx));
+    ret = CHECK_CU(cu->cuCtxPushCurrent(p->display_ctx));
     if (ret < 0)
         return ret;
 
@@ -239,27 +247,27 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
         GLenum target;
         ra_gl_get_raw_tex(mapper->ra, mapper->tex[n], &texture, &target);
 
-        ret = CHECK_CU(cuGraphicsGLRegisterImage(&p->cu_res[n], texture, target,
-                                                 CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD));
+        ret = CHECK_CU(cu->cuGraphicsGLRegisterImage(&p->cu_res[n], texture, target,
+                                                     CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD));
         if (ret < 0)
             goto error;
 
-        ret = CHECK_CU(cuGraphicsMapResources(1, &p->cu_res[n], 0));
+        ret = CHECK_CU(cu->cuGraphicsMapResources(1, &p->cu_res[n], 0));
         if (ret < 0)
             goto error;
 
-        ret = CHECK_CU(cuGraphicsSubResourceGetMappedArray(&p->cu_array[n], p->cu_res[n],
-                                                           0, 0));
+        ret = CHECK_CU(cu->cuGraphicsSubResourceGetMappedArray(&p->cu_array[n], p->cu_res[n],
+                                                               0, 0));
         if (ret < 0)
             goto error;
 
-        ret = CHECK_CU(cuGraphicsUnmapResources(1, &p->cu_res[n], 0));
+        ret = CHECK_CU(cu->cuGraphicsUnmapResources(1, &p->cu_res[n], 0));
         if (ret < 0)
             goto error;
     }
 
  error:
-    eret = CHECK_CU(cuCtxPopCurrent(&dummy));
+    eret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
     if (eret < 0)
         return eret;
 
@@ -269,17 +277,19 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
 static void mapper_uninit(struct ra_hwdec_mapper *mapper)
 {
     struct priv *p = mapper->priv;
+    struct priv_owner *p_owner = mapper->owner->priv;
+    CudaFunctions *cu = p_owner->cu;
     CUcontext dummy;
 
     // Don't bail if any CUDA calls fail. This is all best effort.
-    CHECK_CU(cuCtxPushCurrent(p->display_ctx));
+    CHECK_CU(cu->cuCtxPushCurrent(p->display_ctx));
     for (int n = 0; n < 4; n++) {
         if (p->cu_res[n] > 0)
-            CHECK_CU(cuGraphicsUnregisterResource(p->cu_res[n]));
+            CHECK_CU(cu->cuGraphicsUnregisterResource(p->cu_res[n]));
         p->cu_res[n] = 0;
         ra_tex_free(mapper->ra, &mapper->tex[n]);
     }
-    CHECK_CU(cuCtxPopCurrent(&dummy));
+    CHECK_CU(cu->cuCtxPopCurrent(&dummy));
 }
 
 static void mapper_unmap(struct ra_hwdec_mapper *mapper)
@@ -289,10 +299,12 @@ static void mapper_unmap(struct ra_hwdec_mapper *mapper)
 static int mapper_map(struct ra_hwdec_mapper *mapper)
 {
     struct priv *p = mapper->priv;
+    struct priv_owner *p_owner = mapper->owner->priv;
+    CudaFunctions *cu = p_owner->cu;
     CUcontext dummy;
     int ret = 0, eret = 0;
 
-    ret = CHECK_CU(cuCtxPushCurrent(p->display_ctx));
+    ret = CHECK_CU(cu->cuCtxPushCurrent(p->display_ctx));
     if (ret < 0)
         return ret;
 
@@ -308,14 +320,14 @@ static int mapper_map(struct ra_hwdec_mapper *mapper)
                              mapper->tex[n]->params.format->pixel_size,
             .Height        = mp_image_plane_h(&p->layout, n),
         };
-        ret = CHECK_CU(cuMemcpy2D(&cpy));
+        ret = CHECK_CU(cu->cuMemcpy2D(&cpy));
         if (ret < 0)
             goto error;
     }
 
 
  error:
-   eret = CHECK_CU(cuCtxPopCurrent(&dummy));
+   eret = CHECK_CU(cu->cuCtxPopCurrent(&dummy));
    if (eret < 0)
        return eret;
 
diff --git a/video/out/opengl/hwdec_d3d11egl.c b/video/out/opengl/hwdec_d3d11egl.c
index e741633..f9a6700 100644
--- a/video/out/opengl/hwdec_d3d11egl.c
+++ b/video/out/opengl/hwdec_d3d11egl.c
@@ -178,9 +178,12 @@ static int init(struct ra_hwdec *hw)
     ID3D10Multithread_SetMultithreadProtected(multithread, TRUE);
     ID3D10Multithread_Release(multithread);
 
+    static const int subfmts[] = {IMGFMT_NV12, 0};
     p->hwctx = (struct mp_hwdec_ctx){
         .driver_name = hw->driver->name,
         .av_device_ref = d3d11_wrap_device_ref(p->d3d11_device),
+        .supported_formats = subfmts,
+        .hw_imgfmt = IMGFMT_D3D11,
     };
     hwdec_devices_add(hw->devs, &p->hwctx);
 
@@ -332,7 +335,7 @@ static void mapper_unmap(struct ra_hwdec_mapper *mapper)
 const struct ra_hwdec_driver ra_hwdec_d3d11egl = {
     .name = "d3d11-egl",
     .priv_size = sizeof(struct priv_owner),
-    .imgfmts = {IMGFMT_D3D11NV12, 0},
+    .imgfmts = {IMGFMT_D3D11, 0},
     .init = init,
     .uninit = uninit,
     .mapper = &(const struct ra_hwdec_mapper_driver){
diff --git a/video/out/opengl/hwdec_d3d11eglrgb.c b/video/out/opengl/hwdec_d3d11eglrgb.c
index c8f6580..db7b1cf 100644
--- a/video/out/opengl/hwdec_d3d11eglrgb.c
+++ b/video/out/opengl/hwdec_d3d11eglrgb.c
@@ -135,9 +135,12 @@ static int init(struct ra_hwdec *hw)
         goto fail;
     }
 
+    static const int subfmts[] = {IMGFMT_RGB0, 0};
     p->hwctx = (struct mp_hwdec_ctx){
         .driver_name = hw->driver->name,
         .av_device_ref = d3d11_wrap_device_ref(p->d3d11_device),
+        .supported_formats = subfmts,
+        .hw_imgfmt = IMGFMT_D3D11,
     };
     hwdec_devices_add(hw->devs, &p->hwctx);
 
@@ -159,6 +162,11 @@ static int mapper_init(struct ra_hwdec_mapper *mapper)
     struct priv *p = mapper->priv;
     GL *gl = ra_gl_get(mapper->ra);
 
+    if (mapper->src_params.hw_subfmt != IMGFMT_RGB0) {
+        MP_FATAL(mapper, "Format not supported.\n");
+        return -1;
+    }
+
     gl->GenTextures(1, &p->gl_texture);
     gl->BindTexture(GL_TEXTURE_2D, p->gl_texture);
     gl->TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
@@ -258,7 +266,7 @@ static int mapper_map(struct ra_hwdec_mapper *mapper)
 const struct ra_hwdec_driver ra_hwdec_d3d11eglrgb = {
     .name = "d3d11-egl-rgb",
     .priv_size = sizeof(struct priv_owner),
-    .imgfmts = {IMGFMT_D3D11RGB, 0},
+    .imgfmts = {IMGFMT_D3D11, 0},
     .init = init,
     .uninit = uninit,
     .mapper = &(const struct ra_hwdec_mapper_driver){
diff --git a/video/out/opengl/hwdec_drmprime_drm.c b/video/out/opengl/hwdec_drmprime_drm.c
index faa099a..5e09c5a 100644
--- a/video/out/opengl/hwdec_drmprime_drm.c
+++ b/video/out/opengl/hwdec_drmprime_drm.c
@@ -29,14 +29,12 @@
 #include "video/hwdec.h"
 #include "common/msg.h"
 #include "options/m_config.h"
-#include "libmpv/opengl_cb.h"
+#include "libmpv/render_gl.h"
 #include "video/out/drm_common.h"
 #include "video/out/drm_prime.h"
 #include "video/out/gpu/hwdec.h"
 #include "video/mp_image.h"
 
-#include "ra_gl.h"
-
 extern const struct m_sub_options drm_conf;
 
 struct drm_frame {
@@ -50,7 +48,7 @@ struct priv {
     struct mp_image_params params;
 
     struct drm_atomic_context *ctx;
-    struct drm_frame current_frame, old_frame;
+    struct drm_frame current_frame, last_frame, old_frame;
 
     struct mp_rect src, dst;
 
@@ -71,8 +69,11 @@ static void set_current_frame(struct ra_hwdec *hw, struct drm_frame *frame)
         drm_prime_destroy_framebuffer(p->log, p->ctx->fd, &p->old_frame.fb);
     }
 
-    mp_image_setrefp(&p->old_frame.image, p->current_frame.image);
-    p->old_frame.fb = p->current_frame.fb;
+    mp_image_setrefp(&p->old_frame.image, p->last_frame.image);
+    p->old_frame.fb = p->last_frame.fb;
+
+    mp_image_setrefp(&p->last_frame.image, p->current_frame.image);
+    p->last_frame.fb = p->current_frame.fb;
 
     if (frame) {
         p->current_frame.fb = frame->fb;
@@ -86,15 +87,12 @@ static void set_current_frame(struct ra_hwdec *hw, struct drm_frame *frame)
 static void scale_dst_rect(struct ra_hwdec *hw, int source_w, int source_h ,struct mp_rect *src, struct mp_rect *dst)
 {
     struct priv *p = hw->priv;
-    double hratio, vratio, ratio;
 
     // drm can allow to have a layer that has a different size from framebuffer
     // we scale here the destination size to video mode
-    hratio = vratio = ratio = 1.0;
-
-    hratio = (double)p->display_w / (double)source_w;
-    vratio = (double)p->display_h / (double)source_h;
-    ratio = hratio <= vratio ? hratio : vratio;
+    double hratio = p->display_w / (double)source_w;
+    double vratio = p->display_h / (double)source_h;
+    double ratio = hratio <= vratio ? hratio : vratio;
 
     dst->x0 = src->x0 * ratio;
     dst->x1 = src->x1 * ratio;
@@ -110,37 +108,67 @@ static void scale_dst_rect(struct ra_hwdec *hw, int source_w, int source_h ,stru
     dst->y1 += offset_y;
 }
 
+// Detach the video plane from its framebuffer and CRTC; no-op without p->ctx.
+static void disable_video_plane(struct ra_hwdec *hw)
+{
+    struct priv *p = hw->priv;
+    if (!p->ctx)
+        return;
+
+    // Disabling video plane is needed on some devices when using the
+    // primary plane for video. Primary buffer can't be active with no
+    // framebuffer associated. So we need this function to commit it
+    // right away as mpv will free all framebuffers on playback end.
+    drmModeAtomicReqPtr request = drmModeAtomicAlloc();
+    if (request) {
+        drm_object_set_property(request, p->ctx->video_plane, "FB_ID", 0);
+        drm_object_set_property(request, p->ctx->video_plane, "CRTC_ID", 0);
+
+        int ret = drmModeAtomicCommit(p->ctx->fd, request,
+                                      DRM_MODE_ATOMIC_NONBLOCK, NULL);
+        if (ret) // nonzero return is reported as a failed commit
+            MP_ERR(hw, "Failed to commit disable plane request (code %d)\n", ret);
+        drmModeAtomicFree(request);
+    }
+}
+
 static int overlay_frame(struct ra_hwdec *hw, struct mp_image *hw_image,
                          struct mp_rect *src, struct mp_rect *dst, bool newframe)
 {
     struct priv *p = hw->priv;
-    GL *gl = ra_gl_get(hw->ra);
     AVDRMFrameDescriptor *desc = NULL;
     drmModeAtomicReq *request = NULL;
     struct drm_frame next_frame = {0};
     int ret;
 
+    // grab atomic request from native resources
+    if (p->ctx) {
+        struct mpv_opengl_drm_params *drm_params;
+        drm_params = (mpv_opengl_drm_params *)ra_get_native_resource(hw->ra, "drm_params");
+        if (!drm_params) {
+            MP_ERR(hw, "Failed to retrieve drm params from native resources\n");
+            return -1;
+        }
+        if (drm_params->atomic_request_ptr) {
+            request = *drm_params->atomic_request_ptr;
+        } else {
+            MP_ERR(hw, "drm params pointer to atomic request is invalid");
+            return -1;
+        }
+    }
+
     if (hw_image) {
 
-        // grab opengl-cb windowing info to eventually upscale the overlay
-        // as egl windows could be upscaled to primary plane.
-        struct mpv_opengl_cb_window_pos *glparams =
-                gl ? (struct mpv_opengl_cb_window_pos *)
-                mpgl_get_native_display(gl, "opengl-cb-window-pos") : NULL;
-        if (glparams) {
-            scale_dst_rect(hw, glparams->width, glparams->height, dst, &p->dst);
+        // grab osd windowing info to eventually upscale the overlay
+        // as egl windows could be upscaled to osd plane.
+        struct mpv_opengl_drm_osd_size *osd_size = ra_get_native_resource(hw->ra, "drm_osd_size");
+        if (osd_size) {
+            scale_dst_rect(hw, osd_size->width, osd_size->height, dst, &p->dst);
         } else {
             p->dst = *dst;
         }
         p->src = *src;
 
-        // grab drm interop info
-        struct mpv_opengl_cb_drm_params *drmparams =
-                gl ? (struct mpv_opengl_cb_drm_params *)
-                mpgl_get_native_display(gl, "opengl-cb-drm-params") : NULL;
-        if (drmparams)
-            request = (drmModeAtomicReq *)drmparams->atomic_request;
-
         next_frame.image = hw_image;
         desc = (AVDRMFrameDescriptor *)hw_image->planes[0];
 
@@ -156,28 +184,33 @@ static int overlay_frame(struct ra_hwdec *hw, struct mp_image *hw_image,
             }
 
             if (request) {
-                drm_object_set_property(request, p->ctx->overlay_plane, "FB_ID", next_frame.fb.fb_id);
-                drm_object_set_property(request,  p->ctx->overlay_plane, "CRTC_ID", p->ctx->crtc->id);
-                drm_object_set_property(request,  p->ctx->overlay_plane, "SRC_X",   p->src.x0 << 16);
-                drm_object_set_property(request,  p->ctx->overlay_plane, "SRC_Y",   p->src.y0 << 16);
-                drm_object_set_property(request,  p->ctx->overlay_plane, "SRC_W",   srcw << 16);
-                drm_object_set_property(request,  p->ctx->overlay_plane, "SRC_H",   srch << 16);
-                drm_object_set_property(request,  p->ctx->overlay_plane, "CRTC_X",  MP_ALIGN_DOWN(p->dst.x0, 2));
-                drm_object_set_property(request,  p->ctx->overlay_plane, "CRTC_Y",  MP_ALIGN_DOWN(p->dst.y0, 2));
-                drm_object_set_property(request,  p->ctx->overlay_plane, "CRTC_W",  dstw);
-                drm_object_set_property(request,  p->ctx->overlay_plane, "CRTC_H",  dsth);
-                drm_object_set_property(request,  p->ctx->overlay_plane, "ZPOS",    0);
+                drm_object_set_property(request, p->ctx->video_plane, "FB_ID", next_frame.fb.fb_id);
+                drm_object_set_property(request,  p->ctx->video_plane, "CRTC_ID", p->ctx->crtc->id);
+                drm_object_set_property(request,  p->ctx->video_plane, "SRC_X",   p->src.x0 << 16);
+                drm_object_set_property(request,  p->ctx->video_plane, "SRC_Y",   p->src.y0 << 16);
+                drm_object_set_property(request,  p->ctx->video_plane, "SRC_W",   srcw << 16);
+                drm_object_set_property(request,  p->ctx->video_plane, "SRC_H",   srch << 16);
+                drm_object_set_property(request,  p->ctx->video_plane, "CRTC_X",  MP_ALIGN_DOWN(p->dst.x0, 2));
+                drm_object_set_property(request,  p->ctx->video_plane, "CRTC_Y",  MP_ALIGN_DOWN(p->dst.y0, 2));
+                drm_object_set_property(request,  p->ctx->video_plane, "CRTC_W",  dstw);
+                drm_object_set_property(request,  p->ctx->video_plane, "CRTC_H",  dsth);
+                drm_object_set_property(request,  p->ctx->video_plane, "ZPOS",    0);
             } else {
-                ret = drmModeSetPlane(p->ctx->fd, p->ctx->overlay_plane->id, p->ctx->crtc->id, next_frame.fb.fb_id, 0,
+                ret = drmModeSetPlane(p->ctx->fd, p->ctx->video_plane->id, p->ctx->crtc->id, next_frame.fb.fb_id, 0,
                                       MP_ALIGN_DOWN(p->dst.x0, 2), MP_ALIGN_DOWN(p->dst.y0, 2), dstw, dsth,
                                       p->src.x0 << 16, p->src.y0 << 16 , srcw << 16, srch << 16);
                 if (ret < 0) {
-                    MP_ERR(hw, "Failed to set the plane %d (buffer %d).\n", p->ctx->overlay_plane->id,
+                    MP_ERR(hw, "Failed to set the plane %d (buffer %d).\n", p->ctx->video_plane->id,
                                 next_frame.fb.fb_id);
                     goto fail;
                 }
             }
         }
+    } else {
+        disable_video_plane(hw);
+
+        while (p->old_frame.fb.fb_id)
+          set_current_frame(hw, NULL);
     }
 
     set_current_frame(hw, &next_frame);
@@ -192,6 +225,7 @@ static void uninit(struct ra_hwdec *hw)
 {
     struct priv *p = hw->priv;
 
+    disable_video_plane(hw);
     set_current_frame(hw, NULL);
 
     if (p->ctx) {
@@ -203,36 +237,28 @@ static void uninit(struct ra_hwdec *hw)
 static int init(struct ra_hwdec *hw)
 {
     struct priv *p = hw->priv;
-    int drm_overlay;
-
-    if (!ra_is_gl(hw->ra))
-        return -1;
+    int osd_plane_id, video_plane_id;
 
     p->log = hw->log;
 
     void *tmp = talloc_new(NULL);
     struct drm_opts *opts = mp_get_config_group(tmp, hw->global, &drm_conf);
-    drm_overlay = opts->drm_overlay_id;
+    osd_plane_id = opts->drm_osd_plane_id;
+    video_plane_id = opts->drm_video_plane_id;
     talloc_free(tmp);
 
-    GL *gl = ra_gl_get(hw->ra);
-    struct mpv_opengl_cb_drm_params *params =
-            gl ? (struct mpv_opengl_cb_drm_params *)
-            mpgl_get_native_display(gl, "opengl-cb-drm-params") : NULL;
-    if (!params) {
-        MP_VERBOSE(hw, "Could not get drm interop info.\n");
-        goto err;
-    }
+    struct mpv_opengl_drm_params *drm_params;
 
-    if (params->fd) {
-        p->ctx = drm_atomic_create_context(p->log, params->fd, params->crtc_id,
-                                           drm_overlay);
+    drm_params = ra_get_native_resource(hw->ra, "drm_params");
+    if (drm_params) {
+        p->ctx = drm_atomic_create_context(p->log, drm_params->fd, drm_params->crtc_id,
+                                           drm_params->connector_id, osd_plane_id, video_plane_id);
         if (!p->ctx) {
             mp_err(p->log, "Failed to retrieve DRM atomic context.\n");
             goto err;
         }
     } else {
-        mp_err(p->log, "Failed to retrieve DRM fd from native display.\n");
+        mp_verbose(p->log, "Failed to retrieve DRM fd from native display.\n");
         goto err;
     }
 
@@ -244,13 +270,13 @@ static int init(struct ra_hwdec *hw)
         drmModeFreeCrtc(crtc);
     }
 
-
     uint64_t has_prime;
     if (drmGetCap(p->ctx->fd, DRM_CAP_PRIME, &has_prime) < 0) {
         MP_ERR(hw, "Card does not support prime handles.\n");
         goto err;
     }
 
+    disable_video_plane(hw);
     return 0;
 
 err:
diff --git a/video/out/opengl/hwdec_dxva2gldx.c b/video/out/opengl/hwdec_dxva2gldx.c
index 984fd7f..bbf76b0 100644
--- a/video/out/opengl/hwdec_dxva2gldx.c
+++ b/video/out/opengl/hwdec_dxva2gldx.c
@@ -67,12 +67,12 @@ static int init(struct ra_hwdec *hw)
 
     // AMD drivers won't open multiple dxinterop HANDLES on the same D3D device,
     // so we request the one already in use by context_dxinterop
-    p->device_h = mpgl_get_native_display(gl, "dxinterop_device_HANDLE");
+    p->device_h = ra_get_native_resource(hw->ra, "dxinterop_device_HANDLE");
     if (!p->device_h)
         return -1;
 
     // But we also still need the actual D3D device
-    p->device = mpgl_get_native_display(gl, "IDirect3DDevice9Ex");
+    p->device = ra_get_native_resource(hw->ra, "IDirect3DDevice9Ex");
     if (!p->device)
         return -1;
     IDirect3DDevice9Ex_AddRef(p->device);
diff --git a/video/out/opengl/hwdec_ios.m b/video/out/opengl/hwdec_ios.m
index b8d4876..a16a09f 100644
--- a/video/out/opengl/hwdec_ios.m
+++ b/video/out/opengl/hwdec_ios.m
@@ -253,8 +253,10 @@ static void mapper_uninit(struct ra_hwdec_mapper *mapper)
     struct priv *p = mapper->priv;
 
     CVPixelBufferRelease(p->pbuf);
-    CFRelease(p->gl_texture_cache);
-    p->gl_texture_cache = NULL;
+    if (p->gl_texture_cache) {
+        CFRelease(p->gl_texture_cache);
+        p->gl_texture_cache = NULL;
+    }
 }
 
 const struct ra_hwdec_driver ra_hwdec_videotoolbox = {
diff --git a/video/out/opengl/hwdec_rpi.c b/video/out/opengl/hwdec_rpi.c
index 6c080f1..045fa75 100644
--- a/video/out/opengl/hwdec_rpi.c
+++ b/video/out/opengl/hwdec_rpi.c
@@ -36,7 +36,6 @@
 #include "video/out/gpu/hwdec.h"
 
 #include "common.h"
-#include "ra_gl.h"
 
 struct priv {
     struct mp_log *log;
@@ -126,13 +125,12 @@ static void disable_renderer(struct ra_hwdec *hw)
 static void update_overlay(struct ra_hwdec *hw, bool check_window_only)
 {
     struct priv *p = hw->priv;
-    GL *gl = ra_is_gl(hw->ra) ? ra_gl_get(hw->ra) : NULL;
     MMAL_PORT_T *input = p->renderer->input[0];
     struct mp_rect src = p->src;
     struct mp_rect dst = p->dst;
 
     int defs[4] = {0, 0, 0, 0};
-    int *z = gl ? mpgl_get_native_display(gl, "MPV_RPI_WINDOW") : NULL;
+    int *z = ra_get_native_resource(hw->ra, "MPV_RPI_WINDOW");
     if (!z)
         z = defs;
 
diff --git a/video/out/opengl/hwdec_vaegl.c b/video/out/opengl/hwdec_vaegl.c
index b4587c5..2ff0d98 100644
--- a/video/out/opengl/hwdec_vaegl.c
+++ b/video/out/opengl/hwdec_vaegl.c
@@ -36,6 +36,7 @@
 #include "video/vaapi.h"
 #include "common.h"
 #include "ra_gl.h"
+#include "libmpv/render_gl.h"
 
 #ifndef GL_OES_EGL_image
 typedef void* GLeglImageOES;
@@ -55,9 +56,9 @@ typedef void *EGLImageKHR;
 #if HAVE_VAAPI_X11
 #include <va/va_x11.h>
 
-static VADisplay *create_x11_va_display(GL *gl)
+static VADisplay *create_x11_va_display(struct ra *ra)
 {
-    Display *x11 = mpgl_get_native_display(gl, "x11");
+    Display *x11 = ra_get_native_resource(ra, "x11");
     return x11 ? vaGetDisplay(x11) : NULL;
 }
 #endif
@@ -65,9 +66,9 @@ static VADisplay *create_x11_va_display(GL *gl)
 #if HAVE_VAAPI_WAYLAND
 #include <va/va_wayland.h>
 
-static VADisplay *create_wayland_va_display(GL *gl)
+static VADisplay *create_wayland_va_display(struct ra *ra)
 {
-    struct wl_display *wl = mpgl_get_native_display(gl, "wl");
+    struct wl_display *wl = ra_get_native_resource(ra, "wl");
     return wl ? vaGetDisplayWl(wl) : NULL;
 }
 #endif
@@ -75,19 +76,19 @@ static VADisplay *create_wayland_va_display(GL *gl)
 #if HAVE_VAAPI_DRM
 #include <va/va_drm.h>
 
-static VADisplay *create_drm_va_display(GL *gl)
+static VADisplay *create_drm_va_display(struct ra *ra)
 {
-    int drm_fd = (intptr_t)mpgl_get_native_display(gl, "drm");
-    // Note: yes, drm_fd==0 could be valid - but it's rare and doesn't fit with
-    //       our slightly crappy way of passing it through, so consider 0 not
-    //       valid.
-    return drm_fd ? vaGetDisplayDRM(drm_fd) : NULL;
+    mpv_opengl_drm_params *params = ra_get_native_resource(ra, "drm_params");
+    if (!params || params->render_fd < 0)
+        return NULL;
+
+    return vaGetDisplayDRM(params->render_fd);
 }
 #endif
 
 struct va_create_native {
     const char *name;
-    VADisplay *(*create)(GL *gl);
+    VADisplay *(*create)(struct ra *ra);
 };
 
 static const struct va_create_native create_native_cbs[] = {
@@ -102,12 +103,12 @@ static const struct va_create_native create_native_cbs[] = {
 #endif
 };
 
-static VADisplay *create_native_va_display(GL *gl, struct mp_log *log)
+static VADisplay *create_native_va_display(struct ra *ra, struct mp_log *log)
 {
     for (int n = 0; n < MP_ARRAY_SIZE(create_native_cbs); n++) {
         const struct va_create_native *disp = &create_native_cbs[n];
         mp_verbose(log, "Trying to open a %s VA display...\n", disp->name);
-        VADisplay *display = disp->create(gl);
+        VADisplay *display = disp->create(ra);
         if (display)
             return display;
     }
@@ -169,7 +170,7 @@ static int init(struct ra_hwdec *hw)
         !(gl->mpgl_caps & MPGL_CAP_TEX_RG))
         return -1;
 
-    p->display = create_native_va_display(gl, hw->log);
+    p->display = create_native_va_display(hw->ra, hw->log);
     if (!p->display) {
         MP_VERBOSE(hw, "Could not create a VA display.\n");
         return -1;
diff --git a/video/out/opengl/libmpv_gl.c b/video/out/opengl/libmpv_gl.c
new file mode 100644
index 0000000..ae6ec66
--- /dev/null
+++ b/video/out/opengl/libmpv_gl.c
@@ -0,0 +1,127 @@
+#include "common.h"
+#include "context.h"
+#include "ra_gl.h"
+#include "options/m_config.h"
+#include "libmpv/render_gl.h"
+#include "video/out/gpu/libmpv_gpu.h"
+#include "video/out/gpu/ra.h"
+
+struct priv {
+    GL *gl;
+    struct ra_ctx *ra_ctx;
+};
+
+static int init(struct libmpv_gpu_context *ctx, mpv_render_param *params)
+{
+    ctx->priv = talloc_zero(NULL, struct priv);
+    struct priv *p = ctx->priv;
+
+    mpv_opengl_init_params *init_params =
+        get_mpv_render_param(params, MPV_RENDER_PARAM_OPENGL_INIT_PARAMS, NULL);
+    if (!init_params)
+        return MPV_ERROR_INVALID_PARAMETER;
+
+    p->gl = talloc_zero(p, GL);
+
+    mpgl_load_functions2(p->gl, init_params->get_proc_address,
+                         init_params->get_proc_address_ctx,
+                         init_params->extra_exts, ctx->log);
+    if (!p->gl->version && !p->gl->es) {
+        MP_FATAL(ctx, "OpenGL not initialized.\n");
+        return MPV_ERROR_UNSUPPORTED;
+    }
+
+    // initialize a blank ra_ctx to reuse ra_gl_ctx
+    p->ra_ctx = talloc_zero(p, struct ra_ctx);
+    p->ra_ctx->log = ctx->log;
+    p->ra_ctx->global = ctx->global;
+    p->ra_ctx->opts = (struct ra_ctx_opts) {
+        .probing = false,
+        .allow_sw = true,
+    };
+
+    static const struct ra_swapchain_fns empty_swapchain_fns = {0};
+    struct ra_gl_ctx_params gl_params = {
+        // vo_opengl_cb is essentially like a gigantic external swapchain where
+        // the user is in charge of presentation / swapping etc. But we don't
+        // actually need to provide any of these functions, since we can just
+        // not call them to begin with - so just set it to an empty object to
+        // signal to ra_gl_p that we don't care about its latency emulation
+        // functionality
+        .external_swapchain = &empty_swapchain_fns
+    };
+
+    p->gl->SwapInterval = NULL; // we shouldn't randomly change this, so lock it
+    if (!ra_gl_ctx_init(p->ra_ctx, p->gl, gl_params))
+        return MPV_ERROR_UNSUPPORTED;
+
+    int debug;
+    mp_read_option_raw(ctx->global, "gpu-debug", &m_option_type_flag, &debug);
+    p->ra_ctx->opts.debug = debug;
+    p->gl->debug_context = debug;
+    ra_gl_set_debug(p->ra_ctx->ra, debug);
+
+    ctx->ra = p->ra_ctx->ra;
+
+    // Legacy API user loading for opengl-cb. Explicitly inactive for render API.
+    if (get_mpv_render_param(params, (mpv_render_param_type)-1, NULL) ==
+        ctx->global && p->gl->MPGetNativeDisplay)
+    {
+        void *x11 = p->gl->MPGetNativeDisplay("x11");
+        if (x11)
+            ra_add_native_resource(ctx->ra, "x11", x11);
+        void *wl = p->gl->MPGetNativeDisplay("wl");
+        if (wl)
+            ra_add_native_resource(ctx->ra, "wl", wl);
+    }
+
+    return 0;
+}
+
+static int wrap_fbo(struct libmpv_gpu_context *ctx, mpv_render_param *params,
+                    struct ra_tex **out)
+{
+    struct priv *p = ctx->priv;
+
+    mpv_opengl_fbo *fbo =
+        get_mpv_render_param(params, MPV_RENDER_PARAM_OPENGL_FBO, NULL);
+    if (!fbo)
+        return MPV_ERROR_INVALID_PARAMETER;
+
+    if (fbo->fbo && !(p->gl->mpgl_caps & MPGL_CAP_FB)) {
+        MP_FATAL(ctx, "Rendering to FBO requested, but no FBO extension found!\n");
+        return MPV_ERROR_UNSUPPORTED;
+    }
+
+    struct ra_swapchain *sw = p->ra_ctx->swapchain;
+    struct ra_fbo target;
+    ra_gl_ctx_resize(sw, fbo->w, fbo->h, fbo->fbo);
+    ra_gl_ctx_start_frame(sw, &target);
+    *out = target.tex;
+    return 0;
+}
+
+static void done_frame(struct libmpv_gpu_context *ctx, bool ds)
+{
+    struct priv *p = ctx->priv;
+
+    struct ra_swapchain *sw = p->ra_ctx->swapchain;
+    struct vo_frame dummy = {.display_synced = ds};
+    ra_gl_ctx_submit_frame(sw, &dummy);
+}
+
+static void destroy(struct libmpv_gpu_context *ctx)
+{
+    struct priv *p = ctx->priv;
+
+    if (p->ra_ctx)
+        ra_gl_ctx_uninit(p->ra_ctx);
+}
+
+const struct libmpv_gpu_context_fns libmpv_gpu_context_gl = {
+    .api_name = MPV_RENDER_API_TYPE_OPENGL,
+    .init = init,
+    .wrap_fbo = wrap_fbo,
+    .done_frame = done_frame,
+    .destroy = destroy,
+};
diff --git a/video/out/opengl/ra_gl.c b/video/out/opengl/ra_gl.c
index 5b03368..7112464 100644
--- a/video/out/opengl/ra_gl.c
+++ b/video/out/opengl/ra_gl.c
@@ -101,6 +101,7 @@ static int ra_init_gl(struct ra *ra, GL *gl)
         {RA_CAP_TEX_1D,             MPGL_CAP_1D_TEX},
         {RA_CAP_TEX_3D,             MPGL_CAP_3D_TEX},
         {RA_CAP_COMPUTE,            MPGL_CAP_COMPUTE_SHADER},
+        {RA_CAP_NUM_GROUPS,         MPGL_CAP_COMPUTE_SHADER},
         {RA_CAP_NESTED_ARRAY,       MPGL_CAP_NESTED_ARRAY},
     };
 
@@ -276,6 +277,13 @@ static struct ra_tex *gl_tex_create_blank(struct ra *ra,
         tex_gl->target = GL_TEXTURE_EXTERNAL_OES;
     }
 
+    if (params->downloadable && !(params->dimensions == 2 &&
+                                  params->format->renderable))
+    {
+        gl_tex_destroy(ra, tex);
+        return NULL;
+    }
+
     return tex;
 }
 
@@ -283,6 +291,8 @@ static struct ra_tex *gl_tex_create(struct ra *ra,
                                     const struct ra_tex_params *params)
 {
     GL *gl = ra_gl_get(ra);
+    assert(!params->format->dummy_format);
+
     struct ra_tex *tex = gl_tex_create_blank(ra, params);
     if (!tex)
         return NULL;
@@ -326,8 +336,11 @@ static struct ra_tex *gl_tex_create(struct ra *ra,
 
     gl_check_error(gl, ra->log, "after creating texture");
 
-    // Even blitting needs an FBO in OpenGL for strange reasons
-    if (tex->params.render_dst || tex->params.blit_src || tex->params.blit_dst) {
+    // Even blitting needs an FBO in OpenGL for strange reasons.
+    // Download is handled by reading from an FBO.
+    if (tex->params.render_dst || tex->params.blit_src ||
+        tex->params.blit_dst || tex->params.downloadable)
+    {
         if (!tex->params.format->renderable) {
             MP_ERR(ra, "Trying to create renderable texture with unsupported "
                    "format.\n");
@@ -382,6 +395,7 @@ static const struct ra_format fbo_dummy_format = {
         .flags = F_CR,
     },
     .renderable = true,
+    .dummy_format = true,
 };
 
 // Create a ra_tex that merely wraps an existing framebuffer. gl_fbo can be 0
@@ -508,6 +522,18 @@ static bool gl_tex_upload(struct ra *ra,
     return true;
 }
 
+static bool gl_tex_download(struct ra *ra, struct ra_tex_download_params *params)
+{
+    GL *gl = ra_gl_get(ra);
+    struct ra_tex *tex = params->tex;
+    struct ra_tex_gl *tex_gl = tex->priv;
+    if (!tex_gl->fbo)
+        return false;
+    return gl_read_fbo_contents(gl, tex_gl->fbo, 1, tex_gl->format, tex_gl->type,
+                                tex->params.w, tex->params.h, params->dst,
+                                params->stride);
+}
+
 static void gl_buf_destroy(struct ra *ra, struct ra_buf *buf)
 {
     if (!buf)
@@ -996,6 +1022,10 @@ static void gl_renderpass_run(struct ra *ra,
         assert(params->target->params.render_dst);
         assert(params->target->params.format == pass->params.target_format);
         gl->BindFramebuffer(GL_FRAMEBUFFER, target_gl->fbo);
+        if (pass->params.invalidate_target && gl->InvalidateFramebuffer) {
+            GLenum fb = target_gl->fbo ? GL_COLOR_ATTACHMENT0 : GL_COLOR;
+            gl->InvalidateFramebuffer(GL_FRAMEBUFFER, 1, &fb);
+        }
         gl->Viewport(params->viewport.x0, params->viewport.y0,
                      mp_rect_w(params->viewport),
                      mp_rect_h(params->viewport));
@@ -1126,6 +1156,7 @@ static struct ra_fns ra_fns_gl = {
     .tex_create             = gl_tex_create,
     .tex_destroy            = gl_tex_destroy,
     .tex_upload             = gl_tex_upload,
+    .tex_download           = gl_tex_download,
     .buf_create             = gl_buf_create,
     .buf_destroy            = gl_buf_destroy,
     .buf_update             = gl_buf_update,
diff --git a/video/out/opengl/utils.c b/video/out/opengl/utils.c
index 34f4736..a551ce4 100644
--- a/video/out/opengl/utils.c
+++ b/video/out/opengl/utils.c
@@ -105,25 +105,23 @@ void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type,
     gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4);
 }
 
-mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h)
+bool gl_read_fbo_contents(GL *gl, int fbo, int dir, GLenum format, GLenum type,
+                          int w, int h, uint8_t *dst, int dst_stride)
 {
-    if (gl->es)
-        return NULL; // ES can't read from front buffer
-    mp_image_t *image = mp_image_alloc(IMGFMT_RGB24, w, h);
-    if (!image)
-        return NULL;
+    assert(dir == 1 || dir == -1);
+    if (fbo == 0 && gl->es)
+        return false; // ES can't read from front buffer
     gl->BindFramebuffer(GL_FRAMEBUFFER, fbo);
     GLenum obj = fbo ? GL_COLOR_ATTACHMENT0 : GL_FRONT;
     gl->PixelStorei(GL_PACK_ALIGNMENT, 1);
     gl->ReadBuffer(obj);
-    //flip image while reading (and also avoid stride-related trouble)
-    for (int y = 0; y < h; y++) {
-        gl->ReadPixels(0, h - y - 1, w, 1, GL_RGB, GL_UNSIGNED_BYTE,
-                       image->planes[0] + y * image->stride[0]);
-    }
+    // read line by line: dir=1 keeps row order, dir=-1 flips; avoids stride trouble
+    int y1 = dir > 0 ? 0 : h - 1; // first dst row; last valid row (h - 1) when flipping
+    for (int y = 0; y < h; y++)
+        gl->ReadPixels(0, y, w, 1, format, type, dst + (y1 + dir * y) * dst_stride);
     gl->PixelStorei(GL_PACK_ALIGNMENT, 4);
     gl->BindFramebuffer(GL_FRAMEBUFFER, 0);
-    return image;
+    return true;
 }
 
 static void gl_vao_enable_attribs(struct gl_vao *vao)
diff --git a/video/out/opengl/utils.h b/video/out/opengl/utils.h
index 53127e4..9bcadae 100644
--- a/video/out/opengl/utils.h
+++ b/video/out/opengl/utils.h
@@ -32,7 +32,8 @@ void gl_upload_tex(GL *gl, GLenum target, GLenum format, GLenum type,
                    const void *dataptr, int stride,
                    int x, int y, int w, int h);
 
-mp_image_t *gl_read_fbo_contents(GL *gl, int fbo, int w, int h);
+bool gl_read_fbo_contents(GL *gl, int fbo, int dir, GLenum format, GLenum type,
+                          int w, int h, uint8_t *dst, int dst_stride);
 
 struct gl_vao {
     GL *gl;
diff --git a/video/out/vo.c b/video/out/vo.c
index 63f5b34..9ecfd76 100644
--- a/video/out/vo.c
+++ b/video/out/vo.c
@@ -23,8 +23,6 @@
 #include <pthread.h>
 #include <math.h>
 
-#include <libavutil/buffer.h>
-
 #include "mpv_talloc.h"
 
 #include "config.h"
@@ -37,6 +35,7 @@
 #include "misc/bstr.h"
 #include "vo.h"
 #include "aspect.h"
+#include "dr_helper.h"
 #include "input/input.h"
 #include "options/m_config.h"
 #include "common/msg.h"
@@ -52,7 +51,7 @@ extern const struct vo_driver video_out_x11;
 extern const struct vo_driver video_out_vdpau;
 extern const struct vo_driver video_out_xv;
 extern const struct vo_driver video_out_gpu;
-extern const struct vo_driver video_out_opengl_cb;
+extern const struct vo_driver video_out_libmpv;
 extern const struct vo_driver video_out_null;
 extern const struct vo_driver video_out_image;
 extern const struct vo_driver video_out_lavc;
@@ -66,6 +65,7 @@ extern const struct vo_driver video_out_tct;
 
 const struct vo_driver *const video_out_drivers[] =
 {
+    &video_out_libmpv,
 #if HAVE_ANDROID
     &video_out_mediacodec_embed,
 #endif
@@ -101,20 +101,14 @@ const struct vo_driver *const video_out_drivers[] =
 #if HAVE_DRM
     &video_out_drm,
 #endif
-#if HAVE_ENCODING
     &video_out_lavc,
-#endif
-#if HAVE_GL
-    &video_out_opengl_cb,
-#endif
     NULL
 };
 
 struct vo_internal {
     pthread_t thread;
     struct mp_dispatch_queue *dispatch;
-
-    atomic_ullong dr_in_flight;
+    struct dr_helper *dr_helper;
 
     // --- The following fields are protected by lock
     pthread_mutex_t lock;
@@ -149,6 +143,7 @@ struct vo_internal {
     int64_t num_successive_vsyncs;
 
     int64_t flip_queue_offset; // queue flip events at most this much in advance
+    int64_t timing_offset;     // same (but from options; not VO configured)
 
     int64_t delayed_count;
     int64_t drop_count;
@@ -164,7 +159,7 @@ struct vo_internal {
     uint64_t current_frame_id;
 
     double display_fps;
-    int opt_framedrop;
+    double reported_display_fps;
 };
 
 extern const struct m_sub_options gl_video_conf;
@@ -185,7 +180,7 @@ static bool get_desc(struct m_obj_desc *dst, int index)
         .options = vo->options,
         .options_prefix = vo->options_prefix,
         .global_opts = vo->global_opts,
-        .hidden = vo->encode || !strcmp(vo->name, "opengl-cb"),
+        .hidden = vo->encode,
         .p = vo,
     };
     return true;
@@ -199,6 +194,7 @@ const struct m_obj_list vo_obj_list = {
         {"gl", "gpu"},
         {"direct3d_shaders", "direct3d"},
         {"opengl", "gpu"},
+        {"opengl-cb", "libmpv"},
         {0}
     },
     .allow_unknown_entries = true,
@@ -213,18 +209,29 @@ static void dispatch_wakeup_cb(void *ptr)
     vo_wakeup(vo);
 }
 
+// Initialize or update options from vo->opts
+static void read_opts(struct vo *vo)
+{
+    struct vo_internal *in = vo->in;
+
+    pthread_mutex_lock(&in->lock);
+    in->timing_offset = (uint64_t)(vo->opts->timing_offset * 1e6);
+    pthread_mutex_unlock(&in->lock);
+}
+
 static void update_opts(void *p)
 {
     struct vo *vo = p;
 
     if (m_config_cache_update(vo->opts_cache)) {
+        read_opts(vo);
+
         // "Legacy" update of video position related options.
         if (vo->driver->control)
             vo->driver->control(vo, VOCTRL_SET_PANSCAN, NULL);
     }
 
-    if (vo->gl_opts_cache && m_config_cache_update(vo->gl_opts_cache))
-    {
+    if (vo->gl_opts_cache && m_config_cache_update(vo->gl_opts_cache)) {
         // "Legacy" update of video GL renderer related options.
         if (vo->driver->control)
             vo->driver->control(vo, VOCTRL_UPDATE_RENDER_OPTS, NULL);
@@ -519,30 +526,30 @@ static void update_display_fps(struct vo *vo)
 
         pthread_mutex_unlock(&in->lock);
 
-        mp_read_option_raw(vo->global, "framedrop", &m_option_type_choice,
-                           &in->opt_framedrop);
-
-        double display_fps;
-        mp_read_option_raw(vo->global, "display-fps", &m_option_type_double,
-                           &display_fps);
-
-        if (display_fps <= 0)
-            vo->driver->control(vo, VOCTRL_GET_DISPLAY_FPS, &display_fps);
+        double fps = 0;
+        vo->driver->control(vo, VOCTRL_GET_DISPLAY_FPS, &fps);
 
         pthread_mutex_lock(&in->lock);
 
-        if (in->display_fps != display_fps) {
-            in->display_fps = display_fps;
-            MP_VERBOSE(vo, "Assuming %f FPS for display sync.\n", display_fps);
+        in->reported_display_fps = fps;
+    }
 
-            // make sure to update the player
-            in->queued_events |= VO_EVENT_WIN_STATE;
-            wakeup_core(vo);
-        }
+    double display_fps = vo->opts->override_display_fps;
+    if (display_fps <= 0)
+        display_fps = in->reported_display_fps;
 
-        in->nominal_vsync_interval = in->display_fps > 0 ? 1e6 / in->display_fps : 0;
+    if (in->display_fps != display_fps) {
+        in->nominal_vsync_interval =  display_fps > 0 ? 1e6 / display_fps : 0;
         in->vsync_interval = MPMAX(in->nominal_vsync_interval, 1);
+        in->display_fps = display_fps;
+
+        MP_VERBOSE(vo, "Assuming %f FPS for display sync.\n", display_fps);
+
+        // make sure to update the player
+        in->queued_events |= VO_EVENT_WIN_STATE;
+        wakeup_core(vo);
     }
+
     pthread_mutex_unlock(&in->lock);
 }
 
@@ -562,19 +569,27 @@ static void run_reconfig(void *p)
 {
     void **pp = p;
     struct vo *vo = pp[0];
-    struct mp_image_params *params = pp[1];
+    struct mp_image *img = pp[1];
     int *ret = pp[2];
 
+    struct mp_image_params *params = &img->params;
+
     struct vo_internal *in = vo->in;
 
+    MP_VERBOSE(vo, "reconfig to %s\n", mp_image_params_to_str(params));
+
     m_config_cache_update(vo->opts_cache);
 
     mp_image_params_get_dsize(params, &vo->dwidth, &vo->dheight);
 
     talloc_free(vo->params);
-    vo->params = talloc_memdup(vo, params, sizeof(*params));
+    vo->params = talloc_dup(vo, params);
 
-    *ret = vo->driver->reconfig(vo, vo->params);
+    if (vo->driver->reconfig2) {
+        *ret = vo->driver->reconfig2(vo, img);
+    } else {
+        *ret = vo->driver->reconfig(vo, vo->params);
+    }
     vo->config_ok = *ret >= 0;
     if (vo->config_ok) {
         check_vo_caps(vo);
@@ -596,7 +611,17 @@ static void run_reconfig(void *p)
 int vo_reconfig(struct vo *vo, struct mp_image_params *params)
 {
     int ret;
-    void *p[] = {vo, params, &ret};
+    struct mp_image dummy = {0};
+    mp_image_set_params(&dummy, params);
+    void *p[] = {vo, &dummy, &ret};
+    mp_dispatch_run(vo->in->dispatch, run_reconfig, p);
+    return ret;
+}
+
+int vo_reconfig2(struct vo *vo, struct mp_image *img)
+{
+    int ret;
+    void *p[] = {vo, img, &ret};
     mp_dispatch_run(vo->in->dispatch, run_reconfig, p);
     return ret;
 }
@@ -630,7 +655,7 @@ void vo_control_async(struct vo *vo, int request, void *data)
 
     switch (request) {
     case VOCTRL_UPDATE_PLAYBACK_STATE:
-        d[2] = ta_xdup_ptrtype(d, (struct voctrl_playback_state *)data);
+        d[2] = talloc_dup(d, (struct voctrl_playback_state *)data);
         break;
     case VOCTRL_KILL_SCREENSAVER:
     case VOCTRL_RESTORE_SCREENSAVER:
@@ -723,13 +748,16 @@ bool vo_is_ready_for_frame(struct vo *vo, int64_t next_pts)
 {
     struct vo_internal *in = vo->in;
     pthread_mutex_lock(&in->lock);
-    bool r = vo->config_ok && !in->frame_queued &&
+    bool blocked = vo->driver->initially_blocked &&
+                   !(in->internal_events & VO_EVENT_INITIAL_UNBLOCK);
+    bool r = vo->config_ok && !in->frame_queued && !blocked &&
              (!in->current_frame || in->current_frame->num_vsyncs < 1);
     if (r && next_pts >= 0) {
         // Don't show the frame too early - it would basically freeze the
         // display by disallowing OSD redrawing or VO interaction.
-        // Actually render the frame at earliest 50ms before target time.
-        next_pts -= (uint64_t)(0.050 * 1e6);
+        // Actually render the frame at earliest the given offset before target
+        // time.
+        next_pts -= in->timing_offset;
         next_pts -= in->flip_queue_offset;
         int64_t now = mp_time_us();
         if (next_pts > now)
@@ -833,7 +861,7 @@ bool vo_render_frame_external(struct vo *vo)
 
     in->dropped_frame &= !frame->display_synced;
     in->dropped_frame &= !(vo->driver->caps & VO_CAP_FRAMEDROP);
-    in->dropped_frame &= (in->opt_framedrop & 1);
+    in->dropped_frame &= frame->can_drop;
     // Even if we're hopelessly behind, rather degrade to 10 FPS playback,
     // instead of just freezing the display forever.
     in->dropped_frame &= now - in->prev_vsync < 100 * 1000;
@@ -889,7 +917,7 @@ bool vo_render_frame_external(struct vo *vo)
         update_vsync_timing_after_swap(vo);
     }
 
-    if (vo->driver->caps & VO_CAP_NOREDRAW) {
+    if (vo->driver->caps & VO_CAP_NORETAIN) {
         talloc_free(in->current_frame);
         in->current_frame = NULL;
     }
@@ -917,7 +945,7 @@ static void do_redraw(struct vo *vo)
 {
     struct vo_internal *in = vo->in;
 
-    if (!vo->config_ok || (vo->driver->caps & VO_CAP_NOREDRAW))
+    if (!vo->config_ok || (vo->driver->caps & VO_CAP_NORETAIN))
         return;
 
     pthread_mutex_lock(&in->lock);
@@ -990,6 +1018,13 @@ void vo_disable_external_renderloop(struct vo *vo)
     in->external_renderloop_drive = false;
 }
 
+static struct mp_image *get_image_vo(void *ctx, int imgfmt, int w, int h,
+                                     int stride_align)
+{
+    struct vo *vo = ctx;
+    return vo->driver->get_image(vo, imgfmt, w, h, stride_align);
+}
+
 static void *vo_thread(void *ptr)
 {
     struct vo *vo = ptr;
@@ -998,11 +1033,15 @@ static void *vo_thread(void *ptr)
 
     mpthread_set_name("vo");
 
+    if (vo->driver->get_image)
+        in->dr_helper = dr_helper_create(in->dispatch, get_image_vo, vo);
+
     int r = vo->driver->preinit(vo) ? -1 : 0;
     mp_rendezvous(vo, r); // init barrier
     if (r < 0)
-        return NULL;
+        goto done;
 
+    read_opts(vo);
     update_display_fps(vo);
     vo_event(vo, VO_EVENT_WIN_STATE);
 
@@ -1057,7 +1096,8 @@ static void *vo_thread(void *ptr)
     talloc_free(in->current_frame);
     in->current_frame = NULL;
     vo->driver->uninit(vo);
-    assert(atomic_load(&vo->in->dr_in_flight) == 0);
+done:
+    TA_FREEP(&in->dr_helper);
     return NULL;
 }
 
@@ -1188,7 +1228,7 @@ void vo_get_src_dst_rects(struct vo *vo, struct mp_rect *out_src,
 // flip_page[_timed] will be called offset_us microseconds too early.
 // (For vo_vdpau, which does its own timing.)
 // num_req_frames set the requested number of requested vo_frame.frames.
-// (For vo_opengl interpolation.)
+// (For vo_gpu interpolation.)
 void vo_set_queue_params(struct vo *vo, int64_t offset_us, int num_req_frames)
 {
     struct vo_internal *in = vo->in;
@@ -1322,6 +1362,25 @@ struct mp_image *vo_get_current_frame(struct vo *vo)
     return r;
 }
 
+struct vo_frame *vo_get_current_vo_frame(struct vo *vo)
+{
+    struct vo_internal *in = vo->in;
+    pthread_mutex_lock(&in->lock); // current_frame is read while holding in->lock
+    struct vo_frame *r = vo_frame_ref(vo->in->current_frame); // ref for the caller
+    pthread_mutex_unlock(&in->lock);
+    return r; // caller frees it; vo_gpu NULL-checks it before use, then talloc_free()s
+}
+
+struct mp_image *vo_get_image(struct vo *vo, int imgfmt, int w, int h,
+                              int stride_align)
+{
+    if (vo->driver->get_image_ts) // thread-safe driver variant: call it directly
+        return vo->driver->get_image_ts(vo, imgfmt, w, h, stride_align);
+    if (vo->in->dr_helper) // created in vo_thread() when the driver has get_image
+        return dr_helper_get_image(vo->in->dr_helper, imgfmt, w, h, stride_align);
+    return NULL; // neither allocation path is available
+}
+
 static void destroy_frame(void *p)
 {
     struct vo_frame *frame = p;
@@ -1361,88 +1420,3 @@ int lookup_keymap_table(const struct mp_keymap *map, int key)
         map++;
     return map->to;
 }
-
-struct free_dr_context {
-    struct vo *vo;
-    AVBufferRef *ref;
-};
-
-static void vo_thread_free(void *ptr)
-{
-    struct free_dr_context *ctx = ptr;
-
-    unsigned long long v = atomic_fetch_add(&ctx->vo->in->dr_in_flight, -1);
-    assert(v); // value before sub is 0 - unexpected underflow.
-
-    av_buffer_unref(&ctx->ref);
-    talloc_free(ctx);
-}
-
-static void free_dr_buffer_on_vo_thread(void *opaque, uint8_t *data)
-{
-    struct free_dr_context *ctx = opaque;
-
-    // The image could be unreffed even on the VO thread. In practice, this
-    // matters most on VO destruction.
-    if (pthread_equal(ctx->vo->in->thread, pthread_self())) {
-        vo_thread_free(ctx);
-    } else {
-        mp_dispatch_run(ctx->vo->in->dispatch, vo_thread_free, ctx);
-    }
-}
-
-struct get_image_cmd {
-    struct vo *vo;
-    int imgfmt, w, h, stride_align;
-    struct mp_image *res;
-};
-
-static void sync_get_image(void *ptr)
-{
-    struct get_image_cmd *cmd = ptr;
-    struct vo *vo = cmd->vo;
-
-    cmd->res = vo->driver->get_image(vo, cmd->imgfmt, cmd->w, cmd->h,
-                                     cmd->stride_align);
-    if (!cmd->res)
-        return;
-
-    // We require exactly 1 AVBufferRef.
-    assert(cmd->res->bufs[0]);
-    assert(!cmd->res->bufs[1]);
-
-    // Apply some magic to get it free'd on the VO thread as well. For this to
-    // work, we create a dummy-ref that aliases the original ref, which is why
-    // the original ref must be writable in the first place. (A newly allocated
-    // image should be always writable of course.)
-    assert(mp_image_is_writeable(cmd->res));
-
-    struct free_dr_context *ctx = talloc_zero(NULL, struct free_dr_context);
-    *ctx = (struct free_dr_context){
-        .vo = vo,
-        .ref = cmd->res->bufs[0],
-    };
-
-    AVBufferRef *new_ref = av_buffer_create(ctx->ref->data, ctx->ref->size,
-                                            free_dr_buffer_on_vo_thread, ctx, 0);
-    if (!new_ref)
-        abort(); // tiny malloc OOM
-
-    cmd->res->bufs[0] = new_ref;
-
-    atomic_fetch_add(&vo->in->dr_in_flight, 1);
-}
-
-struct mp_image *vo_get_image(struct vo *vo, int imgfmt, int w, int h,
-                              int stride_align)
-{
-    if (!vo->driver->get_image)
-        return NULL;
-
-    struct get_image_cmd cmd = {
-        .vo = vo,
-        .imgfmt = imgfmt, .w = w, .h = h, .stride_align = stride_align,
-    };
-    mp_dispatch_run(vo->in->dispatch, sync_get_image, &cmd);
-    return cmd.res;
-}
diff --git a/video/out/vo.h b/video/out/vo.h
index 995d6b9..3c00bb9 100644
--- a/video/out/vo.h
+++ b/video/out/vo.h
@@ -45,10 +45,13 @@ enum {
     VO_EVENT_LIVE_RESIZING              = 1 << 5,
     // Window fullscreen state changed via external influence.
     VO_EVENT_FULLSCREEN_STATE           = 1 << 6,
+    // Special thing for encode mode (vo_driver.initially_blocked).
+    // Part of VO_EVENTS_USER to make vo_is_ready_for_frame() work properly.
+    VO_EVENT_INITIAL_UNBLOCK            = 1 << 7,
 
     // Set of events the player core may be interested in.
     VO_EVENTS_USER = VO_EVENT_RESIZE | VO_EVENT_WIN_STATE |
-                     VO_EVENT_FULLSCREEN_STATE,
+                     VO_EVENT_FULLSCREEN_STATE | VO_EVENT_INITIAL_UNBLOCK,
 };
 
 enum mp_voctrl {
@@ -65,7 +68,7 @@ enum mp_voctrl {
     VOCTRL_SET_EQUALIZER,               // struct voctrl_set_equalizer_args*
     VOCTRL_GET_EQUALIZER,               // struct voctrl_get_equalizer_args*
 
-    /* private to vo_opengl */
+    /* private to vo_gpu */
     VOCTRL_LOAD_HWDEC_API,
 
     // Redraw the image previously passed to draw_image() (basically, repeat
@@ -73,6 +76,11 @@ enum mp_voctrl {
     // be updated and redrawn. Optional; emulated if not available.
     VOCTRL_REDRAW_FRAME,
 
+    // Only used internally in vo_opengl_cb
+    VOCTRL_PREINIT,
+    VOCTRL_UNINIT,
+    VOCTRL_RECONFIG,
+
     VOCTRL_FULLSCREEN,
     VOCTRL_ONTOP,
     VOCTRL_BORDER,
@@ -102,8 +110,13 @@ enum mp_voctrl {
     VOCTRL_GET_DISPLAY_NAMES,
 
     // Retrieve window contents. (Normal screenshots use vo_get_current_frame().)
+    // Deprecated in favor of VOCTRL_SCREENSHOT with the corresponding flags.
     VOCTRL_SCREENSHOT_WIN,              // struct mp_image**
 
+    // A normal screenshot - VOs can react to this if vo_get_current_frame() is
+    // not sufficient.
+    VOCTRL_SCREENSHOT,                  // struct voctrl_screenshot*
+
     VOCTRL_UPDATE_RENDER_OPTS,
 
     VOCTRL_GET_ICC_PROFILE,             // bstr*
@@ -111,6 +124,9 @@ enum mp_voctrl {
     VOCTRL_GET_DISPLAY_FPS,             // double*
 
     VOCTRL_GET_PREF_DEINT,              // int*
+
+    /* private to vo_gpu */
+    VOCTRL_EXTERNAL_RESIZE,
 };
 
 // VOCTRL_SET_EQUALIZER
@@ -167,13 +183,18 @@ struct voctrl_performance_data {
     struct mp_frame_perf fresh, redraw;
 };
 
+struct voctrl_screenshot { // data argument of VOCTRL_SCREENSHOT
+    bool scaled, subs, osd, high_bit_depth; // presumably request flags - TODO confirm
+    struct mp_image *res; // presumably the resulting image - TODO confirm
+};
+
 enum {
     // VO does handle mp_image_params.rotate in 90 degree steps
     VO_CAP_ROTATE90     = 1 << 0,
     // VO does framedrop itself (vo_vdpau). Untimed/encoding VOs never drop.
     VO_CAP_FRAMEDROP    = 1 << 1,
-    // VO does not support redraws (vo_mediacodec_embed).
-    VO_CAP_NOREDRAW     = 1 << 2,
+    // VO does not allow frames to be retained (vo_mediacodec_embed).
+    VO_CAP_NORETAIN     = 1 << 2,
 };
 
 #define VO_MAX_REQ_FRAMES 10
@@ -187,7 +208,6 @@ struct vo_extra {
     struct input_ctx *input_ctx;
     struct osd_state *osd;
     struct encode_lavc_context *encode_lavc_ctx;
-    struct mpv_opengl_cb_context *opengl_cb_context;
     void (*wakeup_cb)(void *ctx);
     void *wakeup_ctx;
 };
@@ -219,6 +239,8 @@ struct vo_frame {
     bool still;
     // Frames are output as fast as possible, with implied vsync blocking.
     bool display_synced;
+    // Dropping the frame is allowed if the VO is behind.
+    bool can_drop;
     // The current frame to be drawn.
     // Warning: When OSD should be redrawn in --force-window --idle mode, this
     //          can be NULL. The VO should draw a black background, OSD on top.
@@ -245,6 +267,12 @@ struct vo_driver {
     // Encoding functionality, which can be invoked via --o only.
     bool encode;
 
+    // This requires waiting for a VO_EVENT_INITIAL_UNBLOCK event before the
+    // first frame can be sent. Doing vo_reconfig*() calls is allowed though.
+    // Encode mode uses this, the core uses vo_is_ready_for_frame() to
+    // implicitly check for this.
+    bool initially_blocked;
+
     // VO_CAP_* bits
     int caps;
 
@@ -274,6 +302,12 @@ struct vo_driver {
     int (*reconfig)(struct vo *vo, struct mp_image_params *params);
 
     /*
+     * Like reconfig(), but provides the whole mp_image for which the change is
+     * required. (The image doesn't have to have real data.)
+     */
+    int (*reconfig2)(struct vo *vo, struct mp_image *img);
+
+    /*
      * Control interface
      */
     int (*control)(struct vo *vo, uint32_t request, void *data);
@@ -309,6 +343,14 @@ struct vo_driver {
                                   int stride_align);
 
     /*
+     * Thread-safe variant of get_image. Set at most one of these callbacks.
+     * This excludes _all_ synchronization magic. The only guarantee is that
+     * vo_driver.uninit is not called before this function returns.
+     */
+    struct mp_image *(*get_image_ts)(struct vo *vo, int imgfmt, int w, int h,
+                                     int stride_align);
+
+    /*
      * Render the given frame to the VO's backbuffer. This operation will be
      * followed by a draw_osd and a flip_page[_timed] call.
      * mpi belongs to the VO; the VO must free it eventually.
@@ -321,6 +363,9 @@ struct vo_driver {
 
     /* Render the given frame. Note that this is also called when repeating
      * or redrawing frames.
+     *
+     * frame is freed by the caller, but the callee can still modify the
+     * contained data and references.
      */
     void (*draw_frame)(struct vo *vo, struct vo_frame *frame);
 
@@ -413,6 +458,7 @@ struct vo {
 struct mpv_global;
 struct vo *init_best_video_out(struct mpv_global *global, struct vo_extra *ex);
 int vo_reconfig(struct vo *vo, struct mp_image_params *p);
+int vo_reconfig2(struct vo *vo, struct mp_image *img);
 
 int vo_control(struct vo *vo, int request, void *data);
 void vo_control_async(struct vo *vo, int request, void *data);
@@ -444,6 +490,7 @@ double vo_get_estimated_vsync_jitter(struct vo *vo);
 double vo_get_display_fps(struct vo *vo);
 double vo_get_delay(struct vo *vo);
 void vo_discard_timing_info(struct vo *vo);
+struct vo_frame *vo_get_current_vo_frame(struct vo *vo);
 struct mp_image *vo_get_image(struct vo *vo, int imgfmt, int w, int h,
                               int stride_align);
 
diff --git a/video/out/vo_drm.c b/video/out/vo_drm.c
index 24189d5..7f52901 100644
--- a/video/out/vo_drm.c
+++ b/video/out/vo_drm.c
@@ -41,6 +41,9 @@
 #define USE_MASTER 0
 #define BUF_COUNT 2
 
+// Modulo that works correctly for negative numbers
+#define MOD(a,b) ((((a)%(b))+(b))%(b))
+
 struct framebuffer {
     uint32_t width;
     uint32_t height;
@@ -71,6 +74,7 @@ struct priv {
     int32_t screen_h;
     struct mp_image *last_input;
     struct mp_image *cur_frame;
+    struct mp_image *cur_frame_cropped;
     struct mp_rect src;
     struct mp_rect dst;
     struct mp_osd_res osd;
@@ -149,8 +153,8 @@ static bool fb_setup_double_buffering(struct vo *vo)
 
     p->front_buf = 0;
     for (unsigned int i = 0; i < 2; i++) {
-        p->bufs[i].width = p->kms->mode.hdisplay;
-        p->bufs[i].height = p->kms->mode.vdisplay;
+        p->bufs[i].width = p->kms->mode.mode.hdisplay;
+        p->bufs[i].height = p->kms->mode.mode.vdisplay;
     }
 
     for (unsigned int i = 0; i < BUF_COUNT; i++) {
@@ -180,9 +184,9 @@ static bool crtc_setup(struct vo *vo)
         return true;
     p->old_crtc = drmModeGetCrtc(p->kms->fd, p->kms->crtc_id);
     int ret = drmModeSetCrtc(p->kms->fd, p->kms->crtc_id,
-                             p->bufs[p->front_buf + BUF_COUNT - 1].fb,
+                             p->bufs[MOD(p->front_buf - 1, BUF_COUNT)].fb,
                              0, 0, &p->kms->connector->connector_id, 1,
-                             &p->kms->mode);
+                             &p->kms->mode.mode);
     p->active = true;
     return ret == 0;
 }
@@ -273,17 +277,7 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     int w = p->dst.x1 - p->dst.x0;
     int h = p->dst.y1 - p->dst.y0;
 
-    // p->osd contains the parameters assuming OSD rendering in window
-    // coordinates, but OSD can only be rendered in the intersection
-    // between window and video rectangle (i.e. not into panscan borders).
-    p->osd.w = w;
-    p->osd.h = h;
-    p->osd.mt = MPMIN(0, p->osd.mt);
-    p->osd.mb = MPMIN(0, p->osd.mb);
-    p->osd.mr = MPMIN(0, p->osd.mr);
-    p->osd.ml = MPMIN(0, p->osd.ml);
-
-    mp_sws_set_from_cmdline(p->sws, vo->opts->sws_opts);
+    mp_sws_set_from_cmdline(p->sws, vo->global);
     p->sws->src = *params;
     p->sws->dst = (struct mp_image_params) {
         .imgfmt = IMGFMT,
@@ -297,6 +291,15 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     p->cur_frame = mp_image_alloc(IMGFMT, p->screen_w, p->screen_h);
     mp_image_params_guess_csp(&p->sws->dst);
     mp_image_set_params(p->cur_frame, &p->sws->dst);
+    p->cur_frame[0].w = p->screen_w;
+    p->cur_frame[0].h = p->screen_h;
+
+    talloc_free(p->cur_frame_cropped);
+    p->cur_frame_cropped = mp_image_new_dummy_ref(p->cur_frame);
+    mp_image_crop_rc(p->cur_frame_cropped, p->dst);
+
+    talloc_free(p->last_input);
+    p->last_input = NULL;
 
     struct framebuffer *buf = p->bufs;
     for (unsigned int i = 0; i < BUF_COUNT; i++)
@@ -320,7 +323,13 @@ static void draw_image(struct vo *vo, mp_image_t *mpi)
             src_rc.x0 = MP_ALIGN_DOWN(src_rc.x0, mpi->fmt.align_x);
             src_rc.y0 = MP_ALIGN_DOWN(src_rc.y0, mpi->fmt.align_y);
             mp_image_crop_rc(&src, src_rc);
-            mp_sws_scale(p->sws, p->cur_frame, &src);
+
+            mp_image_clear(p->cur_frame, 0, 0, p->cur_frame->w, p->dst.y0);
+            mp_image_clear(p->cur_frame, 0, p->dst.y1, p->cur_frame->w, p->cur_frame->h);
+            mp_image_clear(p->cur_frame, 0, p->dst.y0, p->dst.x0, p->dst.y1);
+            mp_image_clear(p->cur_frame, p->dst.x1, p->dst.y0, p->cur_frame->w, p->dst.y1);
+
+            mp_sws_scale(p->sws, p->cur_frame_cropped, &src);
             osd_draw_on_image(vo->osd, p->osd, src.pts, 0, p->cur_frame);
         } else {
             mp_image_clear(p->cur_frame, 0, 0, p->cur_frame->w, p->cur_frame->h);
@@ -328,13 +337,9 @@ static void draw_image(struct vo *vo, mp_image_t *mpi)
         }
 
         struct framebuffer *front_buf = &p->bufs[p->front_buf];
-        int w = p->dst.x1 - p->dst.x0;
-        int h = p->dst.y1 - p->dst.y0;
-        int x = (p->screen_w - w) >> 1;
-        int y = (p->screen_h - h) >> 1;
-        int shift = y * front_buf->stride + x * BYTES_PER_PIXEL;
-        memcpy_pic(front_buf->map + shift, p->cur_frame->planes[0],
-                   w * BYTES_PER_PIXEL, h, front_buf->stride,
+        memcpy_pic(front_buf->map, p->cur_frame->planes[0],
+                   p->cur_frame->w * BYTES_PER_PIXEL, p->cur_frame->h,
+                   front_buf->stride,
                    p->cur_frame->stride[0]);
     }
 
@@ -354,7 +359,7 @@ static void flip_page(struct vo *vo)
                               p->bufs[p->front_buf].fb,
                               DRM_MODE_PAGE_FLIP_EVENT, p);
     if (ret) {
-        MP_WARN(vo, "Cannot flip page for connector\n");
+        MP_WARN(vo, "Failed to queue page flip: %s\n", mp_strerror(errno));
     } else {
         p->front_buf++;
         p->front_buf %= BUF_COUNT;
@@ -394,6 +399,7 @@ static void uninit(struct vo *vo)
 
     talloc_free(p->last_input);
     talloc_free(p->cur_frame);
+    talloc_free(p->cur_frame_cropped);
 }
 
 static int preinit(struct vo *vo)
@@ -414,7 +420,8 @@ static int preinit(struct vo *vo)
     p->kms = kms_create(
         vo->log, vo->opts->drm_opts->drm_connector_spec,
                  vo->opts->drm_opts->drm_mode_id,
-                 vo->opts->drm_opts->drm_overlay_id);
+                 vo->opts->drm_opts->drm_osd_plane_id,
+                 vo->opts->drm_opts->drm_video_plane_id);
     if (!p->kms) {
         MP_ERR(vo, "Failed to create KMS.\n");
         goto err;
@@ -440,6 +447,14 @@ static int preinit(struct vo *vo)
         goto err;
     }
 
+    if (vo->opts->force_monitor_aspect != 0.0) {
+        vo->monitor_par = p->screen_w / (double) p->screen_h /
+                          vo->opts->force_monitor_aspect;
+    } else {
+        vo->monitor_par = 1 / vo->opts->monitor_pixel_aspect;
+    }
+    mp_verbose(vo->log, "Monitor pixel aspect: %g\n", vo->monitor_par);
+
     return 0;
 
 err:
diff --git a/video/out/vo_gpu.c b/video/out/vo_gpu.c
index 95318d3..a80ba23 100644
--- a/video/out/vo_gpu.c
+++ b/video/out/vo_gpu.c
@@ -84,7 +84,7 @@ static void draw_frame(struct vo *vo, struct vo_frame *frame)
     if (!sw->fns->start_frame(sw, &fbo))
         return;
 
-    gl_video_render_frame(p->renderer, frame, fbo);
+    gl_video_render_frame(p->renderer, frame, fbo, RENDER_FRAME_DEF);
     if (!sw->fns->submit_frame(sw, frame)) {
         MP_ERR(vo, "Failed presenting frame!\n");
         return;
@@ -168,7 +168,6 @@ static void get_and_update_ambient_lighting(struct gpu_priv *p)
 static int control(struct vo *vo, uint32_t request, void *data)
 {
     struct gpu_priv *p = vo->priv;
-    struct ra_swapchain *sw = p->ctx->swapchain;
 
     switch (request) {
     case VOCTRL_SET_PANSCAN:
@@ -177,15 +176,11 @@ static int control(struct vo *vo, uint32_t request, void *data)
     case VOCTRL_SET_EQUALIZER:
         vo->want_redraw = true;
         return VO_TRUE;
-    case VOCTRL_SCREENSHOT_WIN: {
-        struct mp_image *screen = NULL;
-        if (sw->fns->screenshot)
-            screen = sw->fns->screenshot(sw);
-        if (!screen)
-            break; // redirect to backend
-        // set image parameters according to the display, if possible
-        screen->params.color = gl_video_get_output_colorspace(p->renderer);
-        *(struct mp_image **)data = screen;
+    case VOCTRL_SCREENSHOT: {
+        struct vo_frame *frame = vo_get_current_vo_frame(vo);
+        if (frame)
+            gl_video_screenshot(p->renderer, frame, data);
+        talloc_free(frame);
         return true;
     }
     case VOCTRL_LOAD_HWDEC_API:
@@ -207,6 +202,10 @@ static int control(struct vo *vo, uint32_t request, void *data)
     case VOCTRL_PERFORMANCE_DATA:
         gl_video_perfdata(p->renderer, (struct voctrl_performance_data *)data);
         return true;
+    case VOCTRL_EXTERNAL_RESIZE:
+        p->ctx->fns->reconfig(p->ctx);
+        resize(vo);
+        return true;
     }
 
     int events = 0;
diff --git a/video/out/vo_lavc.c b/video/out/vo_lavc.c
index 4b69231..e817b53 100644
--- a/video/out/vo_lavc.c
+++ b/video/out/vo_lavc.c
@@ -36,89 +36,58 @@
 #include "sub/osd.h"
 
 struct priv {
-    AVStream *stream;
-    AVCodecContext *codec;
-    int have_first_packet;
-
-    int harddup;
-
-    double lastpts;
-    int64_t lastipts;
-    int64_t lastframeipts;
-    int64_t lastencodedipts;
-    int64_t mindeltapts;
-    double expected_next_pts;
-    mp_image_t *lastimg;
-    int lastdisplaycount;
-
-    AVRational worst_time_base;
-    int worst_time_base_is_stream;
+    struct encoder_context *enc;
 
     bool shutdown;
 };
 
 static int preinit(struct vo *vo)
 {
-    struct priv *vc;
-    if (!encode_lavc_available(vo->encode_lavc_ctx)) {
-        MP_ERR(vo, "the option --o (output file) must be specified\n");
+    struct priv *vc = vo->priv;
+    vc->enc = encoder_context_alloc(vo->encode_lavc_ctx, STREAM_VIDEO, vo->log);
+    if (!vc->enc)
         return -1;
-    }
-    vo->priv = talloc_zero(vo, struct priv);
-    vc = vo->priv;
-    vc->harddup = vo->encode_lavc_ctx->options->harddup;
+    talloc_steal(vc, vc->enc);
     return 0;
 }
 
-static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi);
 static void uninit(struct vo *vo)
 {
     struct priv *vc = vo->priv;
-    if (!vc || vc->shutdown)
-        return;
-
-    pthread_mutex_lock(&vo->encode_lavc_ctx->lock);
-
-    if (vc->lastipts >= 0 && vc->stream)
-        draw_image_unlocked(vo, NULL);
+    struct encoder_context *enc = vc->enc;
 
-    mp_image_unrefp(&vc->lastimg);
+    if (!vc->shutdown)
+        encoder_encode(enc, NULL); // finish encoding
+}
 
-    pthread_mutex_unlock(&vo->encode_lavc_ctx->lock);
+static void on_ready(void *ptr)
+{
+    struct vo *vo = ptr;
 
-    vc->shutdown = true;
+    vo_event(vo, VO_EVENT_INITIAL_UNBLOCK);
 }
 
-static int reconfig(struct vo *vo, struct mp_image_params *params)
+static int reconfig2(struct vo *vo, struct mp_image *img)
 {
     struct priv *vc = vo->priv;
+    AVCodecContext *encoder = vc->enc->encoder;
+
+    struct mp_image_params *params = &img->params;
     enum AVPixelFormat pix_fmt = imgfmt2pixfmt(params->imgfmt);
     AVRational aspect = {params->p_w, params->p_h};
-    uint32_t width = params->w;
-    uint32_t height = params->h;
+    int width = params->w;
+    int height = params->h;
 
-    if (!vc || vc->shutdown)
+    if (vc->shutdown)
         return -1;
 
-    pthread_mutex_lock(&vo->encode_lavc_ctx->lock);
-
-    if (vc->stream) {
-        /* NOTE:
-         * in debug builds we get a "comparison between signed and unsigned"
-         * warning here. We choose to ignore that; just because ffmpeg currently
-         * uses a plain 'int' for these struct fields, it doesn't mean it always
-         * will */
-        if (width == vc->codec->width &&
-                height == vc->codec->height) {
-            if (aspect.num != vc->codec->sample_aspect_ratio.num ||
-                    aspect.den != vc->codec->sample_aspect_ratio.den) {
-                /* aspect-only changes are not critical */
-                MP_WARN(vo, "unsupported pixel aspect ratio change from %d:%d to %d:%d\n",
-                       vc->codec->sample_aspect_ratio.num,
-                       vc->codec->sample_aspect_ratio.den,
-                       aspect.num, aspect.den);
-            }
-            goto done;
+    if (avcodec_is_open(encoder)) {
+        if (width == encoder->width && height == encoder->height &&
+            pix_fmt == encoder->pix_fmt)
+        {
+            // consider these changes not critical
+            MP_ERR(vo, "Ignoring mid-stream parameter changes!\n");
+            return 0;
         }
 
         /* FIXME Is it possible with raw video? */
@@ -132,11 +101,7 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     // - Second calls after reconfigure() already failed once fail (due to the
     //   vc->shutdown check above).
     // - Second calls after reconfigure() already succeeded once return early
-    //   (due to the vc->stream check above).
-
-    vc->lastipts = AV_NOPTS_VALUE;
-    vc->lastframeipts = AV_NOPTS_VALUE;
-    vc->lastencodedipts = AV_NOPTS_VALUE;
+    //   (due to the avcodec_is_open() check above).
 
     if (pix_fmt == AV_PIX_FMT_NONE) {
         MP_FATAL(vo, "Format %s not supported by lavc.\n",
@@ -144,342 +109,122 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
         goto error;
     }
 
-    if (encode_lavc_alloc_stream(vo->encode_lavc_ctx,
-                                 AVMEDIA_TYPE_VIDEO,
-                                 &vc->stream, &vc->codec) < 0)
-        goto error;
-    vc->stream->sample_aspect_ratio = vc->codec->sample_aspect_ratio =
-            aspect;
-    vc->codec->width = width;
-    vc->codec->height = height;
-    vc->codec->pix_fmt = pix_fmt;
+    encoder->sample_aspect_ratio = aspect;
+    encoder->width = width;
+    encoder->height = height;
+    encoder->pix_fmt = pix_fmt;
+    encoder->colorspace = mp_csp_to_avcol_spc(params->color.space);
+    encoder->color_range = mp_csp_levels_to_avcol_range(params->color.levels);
+
+    AVRational tb;
+
+    // we want to handle:
+    //      1/25
+    //   1001/24000
+    //   1001/30000
+    // for this we would need 120000fps...
+    // however, mpeg-4 only allows 16bit values
+    // so let's take 1001/30000 out
+    tb.num = 24000;
+    tb.den = 1;
 
-    encode_lavc_set_csp(vo->encode_lavc_ctx, vc->codec, params->color.space);
-    encode_lavc_set_csp_levels(vo->encode_lavc_ctx, vc->codec, params->color.levels);
+    const AVRational *rates = encoder->codec->supported_framerates;
+    if (rates && rates[0].den)
+        tb = rates[av_find_nearest_q_idx(tb, rates)];
 
-    if (encode_lavc_open_codec(vo->encode_lavc_ctx, vc->codec) < 0)
+    encoder->time_base = av_inv_q(tb);
+
+    if (!encoder_init_codec_and_muxer(vc->enc, on_ready, vo))
         goto error;
 
-done:
-    pthread_mutex_unlock(&vo->encode_lavc_ctx->lock);
     return 0;
 
 error:
-    pthread_mutex_unlock(&vo->encode_lavc_ctx->lock);
     vc->shutdown = true;
     return -1;
 }
 
 static int query_format(struct vo *vo, int format)
 {
-    enum AVPixelFormat pix_fmt = imgfmt2pixfmt(format);
-
-    if (!vo->encode_lavc_ctx)
-        return 0;
-
-    pthread_mutex_lock(&vo->encode_lavc_ctx->lock);
-    int flags = 0;
-    if (encode_lavc_supports_pixfmt(vo->encode_lavc_ctx, pix_fmt))
-        flags = 1;
-    pthread_mutex_unlock(&vo->encode_lavc_ctx->lock);
-    return flags;
-}
-
-static void write_packet(struct vo *vo, AVPacket *packet)
-{
     struct priv *vc = vo->priv;
 
-    packet->stream_index = vc->stream->index;
-    if (packet->pts != AV_NOPTS_VALUE) {
-        packet->pts = av_rescale_q(packet->pts,
-                                   vc->codec->time_base,
-                                   vc->stream->time_base);
-    } else {
-        MP_VERBOSE(vo, "codec did not provide pts\n");
-        packet->pts = av_rescale_q(vc->lastipts,
-                                   vc->worst_time_base,
-                                   vc->stream->time_base);
-    }
-    if (packet->dts != AV_NOPTS_VALUE) {
-        packet->dts = av_rescale_q(packet->dts,
-                                   vc->codec->time_base,
-                                   vc->stream->time_base);
-    }
-    if (packet->duration > 0) {
-        packet->duration = av_rescale_q(packet->duration,
-                                        vc->codec->time_base,
-                                        vc->stream->time_base);
-    } else {
-        // HACK: libavformat calculates dts wrong if the initial packet
-        // duration is not set, but ONLY if the time base is "high" and if we
-        // have b-frames!
-        if (!packet->duration)
-            if (!vc->have_first_packet)
-                if (vc->codec->has_b_frames
-                        || vc->codec->max_b_frames)
-                    if (vc->stream->time_base.num * 1000LL <=
-                            vc->stream->time_base.den)
-                        packet->duration = FFMAX(1, av_rescale_q(1,
-                             vc->codec->time_base, vc->stream->time_base));
-    }
-
-    if (encode_lavc_write_frame(vo->encode_lavc_ctx,
-                                vc->stream, packet) < 0) {
-        MP_ERR(vo, "error writing at %d %d/%d\n",
-               (int) packet->pts,
-               vc->stream->time_base.num,
-               vc->stream->time_base.den);
-        return;
-    }
+    enum AVPixelFormat pix_fmt = imgfmt2pixfmt(format);
+    const enum AVPixelFormat *p = vc->enc->encoder->codec->pix_fmts;
 
-    vc->have_first_packet = 1;
-}
+    if (!p)
+        return 1;
 
-static void encode_video_and_write(struct vo *vo, AVFrame *frame)
-{
-    struct priv *vc = vo->priv;
-    AVPacket packet = {0};
-
-    int status = avcodec_send_frame(vc->codec, frame);
-    if (status < 0) {
-        MP_ERR(vo, "error encoding at %d %d/%d\n",
-               frame ? (int) frame->pts : -1,
-               vc->codec->time_base.num,
-               vc->codec->time_base.den);
-        return;
-    }
-    for (;;) {
-        av_init_packet(&packet);
-        status = avcodec_receive_packet(vc->codec, &packet);
-        if (status == AVERROR(EAGAIN)) { // No more packets for now.
-            if (frame == NULL) {
-                MP_ERR(vo, "sent flush frame, got EAGAIN");
-            }
-            break;
-        }
-        if (status == AVERROR_EOF) { // No more packets, ever.
-            if (frame != NULL) {
-                MP_ERR(vo, "sent image frame, got EOF");
-            }
-            break;
-        }
-        if (status < 0) {
-            MP_ERR(vo, "error encoding at %d %d/%d\n",
-                   frame ? (int) frame->pts : -1,
-                   vc->codec->time_base.num,
-                   vc->codec->time_base.den);
-            break;
-        }
-        encode_lavc_write_stats(vo->encode_lavc_ctx, vc->codec);
-        write_packet(vo, &packet);
-        av_packet_unref(&packet);
+    while (*p != AV_PIX_FMT_NONE) {
+        if (*p == pix_fmt)
+            return 1;
+        p++;
     }
+
+    return 0;
 }
 
-static void draw_image_unlocked(struct vo *vo, mp_image_t *mpi)
+static void draw_frame(struct vo *vo, struct vo_frame *voframe)
 {
     struct priv *vc = vo->priv;
-    struct encode_lavc_context *ectx = vo->encode_lavc_ctx;
-    AVCodecContext *avc;
-    int64_t frameipts;
-    double nextpts;
-
-    double pts = mpi ? mpi->pts : MP_NOPTS_VALUE;
+    struct encoder_context *enc = vc->enc;
+    struct encode_lavc_context *ectx = enc->encode_lavc_ctx;
+    AVCodecContext *avc = enc->encoder;
 
-    if (mpi) {
-        assert(vo->params);
-
-        struct mp_osd_res dim = osd_res_from_image_params(vo->params);
+    if (voframe->redraw || voframe->repeat || voframe->num_frames < 1)
+        return;
 
-        osd_draw_on_image(vo->osd, dim, mpi->pts, OSD_DRAW_SUB_ONLY, mpi);
-    }
+    struct mp_image *mpi = voframe->frames[0];
 
-    if (!vc || vc->shutdown)
-        goto done;
-    if (!encode_lavc_start(ectx)) {
-        MP_WARN(vo, "NOTE: skipped initial video frame (probably because audio is not there yet)\n");
-        goto done;
-    }
-    if (pts == MP_NOPTS_VALUE) {
-        if (mpi)
-            MP_WARN(vo, "frame without pts, please report; synthesizing pts instead\n");
-        pts = vc->expected_next_pts;
-    }
+    struct mp_osd_res dim = osd_res_from_image_params(vo->params);
+    osd_draw_on_image(vo->osd, dim, mpi->pts, OSD_DRAW_SUB_ONLY, mpi);
 
-    avc = vc->codec;
-
-    if (vc->worst_time_base.den == 0) {
-        //if (avc->time_base.num / avc->time_base.den >= vc->stream->time_base.num / vc->stream->time_base.den)
-        if (avc->time_base.num * (double) vc->stream->time_base.den >=
-                vc->stream->time_base.num * (double) avc->time_base.den) {
-            MP_VERBOSE(vo, "NOTE: using codec time base "
-                       "(%d/%d) for frame dropping; the stream base (%d/%d) is "
-                       "not worse.\n", (int)avc->time_base.num,
-                       (int)avc->time_base.den, (int)vc->stream->time_base.num,
-                       (int)vc->stream->time_base.den);
-            vc->worst_time_base = avc->time_base;
-            vc->worst_time_base_is_stream = 0;
-        } else {
-            MP_WARN(vo, "NOTE: not using codec time base (%d/%d) for frame "
-                    "dropping; the stream base (%d/%d) is worse.\n",
-                    (int)avc->time_base.num, (int)avc->time_base.den,
-                    (int)vc->stream->time_base.num, (int)vc->stream->time_base.den);
-            vc->worst_time_base = vc->stream->time_base;
-            vc->worst_time_base_is_stream = 1;
-        }
-        if (ectx->options->maxfps)
-            vc->mindeltapts = ceil(vc->worst_time_base.den /
-                    (vc->worst_time_base.num * ectx->options->maxfps));
-        else
-            vc->mindeltapts = 0;
-
-        // NOTE: we use the following "axiom" of av_rescale_q:
-        // if time base A is worse than time base B, then
-        //   av_rescale_q(av_rescale_q(x, A, B), B, A) == x
-        // this can be proven as long as av_rescale_q rounds to nearest, which
-        // it currently does
-
-        // av_rescale_q(x, A, B) * B = "round x*A to nearest multiple of B"
-        // and:
-        //    av_rescale_q(av_rescale_q(x, A, B), B, A) * A
-        // == "round av_rescale_q(x, A, B)*B to nearest multiple of A"
-        // == "round 'round x*A to nearest multiple of B' to nearest multiple of A"
-        //
-        // assume this fails. Then there is a value of x*A, for which the
-        // nearest multiple of B is outside the range [(x-0.5)*A, (x+0.5)*A[.
-        // Absurd, as this range MUST contain at least one multiple of B.
-    }
+    if (vc->shutdown)
+        return;
 
-    double timeunit = (double)vc->worst_time_base.num / vc->worst_time_base.den;
+    // Lock for shared timestamp fields.
+    pthread_mutex_lock(&ectx->lock);
 
-    double outpts;
-    if (ectx->options->rawts)
-        outpts = pts;
-    else if (ectx->options->copyts) {
+    double pts = mpi->pts;
+    double outpts = pts;
+    if (!enc->options->rawts) {
         // fix the discontinuity pts offset
-        nextpts = pts;
         if (ectx->discontinuity_pts_offset == MP_NOPTS_VALUE) {
-            ectx->discontinuity_pts_offset = ectx->next_in_pts - nextpts;
-        }
-        else if (fabs(nextpts + ectx->discontinuity_pts_offset - ectx->next_in_pts) > 30) {
+            ectx->discontinuity_pts_offset = ectx->next_in_pts - pts;
+        } else if (fabs(pts + ectx->discontinuity_pts_offset -
+                        ectx->next_in_pts) > 30)
+        {
             MP_WARN(vo, "detected an unexpected discontinuity (pts jumped by "
                     "%f seconds)\n",
-                    nextpts + ectx->discontinuity_pts_offset - ectx->next_in_pts);
-            ectx->discontinuity_pts_offset = ectx->next_in_pts - nextpts;
+                    pts + ectx->discontinuity_pts_offset - ectx->next_in_pts);
+            ectx->discontinuity_pts_offset = ectx->next_in_pts - pts;
         }
 
         outpts = pts + ectx->discontinuity_pts_offset;
     }
-    else {
-        // adjust pts by knowledge of audio pts vs audio playback time
-        double duration = 0;
-        if (ectx->last_video_in_pts != MP_NOPTS_VALUE)
-            duration = pts - ectx->last_video_in_pts;
-        if (duration < 0)
-            duration = timeunit;   // XXX warn about discontinuity?
-        outpts = vc->lastpts + duration;
-        if (ectx->audio_pts_offset != MP_NOPTS_VALUE) {
-            double adj = outpts - pts - ectx->audio_pts_offset;
-            adj = FFMIN(adj, duration * 0.1);
-            adj = FFMAX(adj, -duration * 0.1);
-            outpts -= adj;
-        }
-    }
-    vc->lastpts = outpts;
-    ectx->last_video_in_pts = pts;
-    frameipts = floor((outpts + encode_lavc_getoffset(ectx, vc->codec))
-                      / timeunit + 0.5);
 
-    // calculate expected pts of next video frame
-    vc->expected_next_pts = pts + timeunit;
+    outpts += encoder_get_offset(enc);
 
-    if (!ectx->options->rawts && ectx->options->copyts) {
+    if (!enc->options->rawts) {
+        // calculate expected pts of next video frame
+        double timeunit = av_q2d(avc->time_base);
+        double expected_next_pts = pts + timeunit;
         // set next allowed output pts value
-        nextpts = vc->expected_next_pts + ectx->discontinuity_pts_offset;
+        double nextpts = expected_next_pts + ectx->discontinuity_pts_offset;
         if (nextpts > ectx->next_in_pts)
             ectx->next_in_pts = nextpts;
     }
 
-    // never-drop mode
-    if (ectx->options->neverdrop) {
-        int64_t step = vc->mindeltapts ? vc->mindeltapts : 1;
-        if (frameipts < vc->lastipts + step) {
-            MP_INFO(vo, "--oneverdrop increased pts by %d\n",
-                    (int) (vc->lastipts - frameipts + step));
-            frameipts = vc->lastipts + step;
-            vc->lastpts = frameipts * timeunit - encode_lavc_getoffset(ectx, vc->codec);
-        }
-    }
+    pthread_mutex_unlock(&ectx->lock);
 
-    if (vc->lastipts != AV_NOPTS_VALUE) {
-
-        // we have a valid image in lastimg
-        while (vc->lastimg && vc->lastipts < frameipts) {
-            int64_t thisduration = vc->harddup ? 1 : (frameipts - vc->lastipts);
-
-            // we will ONLY encode this frame if it can be encoded at at least
-            // vc->mindeltapts after the last encoded frame!
-            int64_t skipframes =
-                (vc->lastencodedipts == AV_NOPTS_VALUE)
-                    ? 0
-                    : vc->lastencodedipts + vc->mindeltapts - vc->lastipts;
-            if (skipframes < 0)
-                skipframes = 0;
-
-            if (thisduration > skipframes) {
-                AVFrame *frame = mp_image_to_av_frame(vc->lastimg);
-                if (!frame)
-                    abort();
-
-                // this is a nop, unless the worst time base is the STREAM time base
-                frame->pts = av_rescale_q(vc->lastipts + skipframes,
-                                          vc->worst_time_base, avc->time_base);
-                frame->pict_type = 0; // keep this at unknown/undefined
-                frame->quality = avc->global_quality;
-                encode_video_and_write(vo, frame);
-                av_frame_free(&frame);
-
-                ++vc->lastdisplaycount;
-                vc->lastencodedipts = vc->lastipts + skipframes;
-            }
-
-            vc->lastipts += thisduration;
-        }
-    }
-
-    if (!mpi) {
-        // finish encoding
-        encode_video_and_write(vo, NULL);
-    } else {
-        if (frameipts >= vc->lastframeipts) {
-            if (vc->lastframeipts != AV_NOPTS_VALUE && vc->lastdisplaycount != 1)
-                MP_INFO(vo, "Frame at pts %d got displayed %d times\n",
-                        (int) vc->lastframeipts, vc->lastdisplaycount);
-            talloc_free(vc->lastimg);
-            vc->lastimg = mpi;
-            mpi = NULL;
-
-            vc->lastframeipts = vc->lastipts = frameipts;
-            if (ectx->options->rawts && vc->lastipts < 0) {
-                MP_ERR(vo, "why does this happen? DEBUG THIS! vc->lastipts = %lld\n", (long long) vc->lastipts);
-                vc->lastipts = -1;
-            }
-            vc->lastdisplaycount = 0;
-        } else {
-            MP_INFO(vo, "Frame at pts %d got dropped "
-                    "entirely because pts went backwards\n", (int) frameipts);
-        }
-    }
+    AVFrame *frame = mp_image_to_av_frame(mpi);
+    if (!frame)
+        abort();
 
-done:
-    talloc_free(mpi);
-}
-
-static void draw_image(struct vo *vo, mp_image_t *mpi)
-{
-    pthread_mutex_lock(&vo->encode_lavc_ctx->lock);
-    draw_image_unlocked(vo, mpi);
-    pthread_mutex_unlock(&vo->encode_lavc_ctx->lock);
+    frame->pts = rint(outpts * av_q2d(av_inv_q(avc->time_base)));
+    frame->pict_type = 0; // keep this at unknown/undefined
+    frame->quality = avc->global_quality;
+    encoder_encode(enc, frame);
+    av_frame_free(&frame);
 }
 
 static void flip_page(struct vo *vo)
@@ -495,13 +240,15 @@ const struct vo_driver video_out_lavc = {
     .encode = true,
     .description = "video encoding using libavcodec",
     .name = "lavc",
+    .initially_blocked = true,
     .untimed = true,
+    .priv_size = sizeof(struct priv),
     .preinit = preinit,
     .query_format = query_format,
-    .reconfig = reconfig,
+    .reconfig2 = reconfig2,
     .control = control,
     .uninit = uninit,
-    .draw_image = draw_image,
+    .draw_frame = draw_frame,
     .flip_page = flip_page,
 };
 
diff --git a/video/out/vo_libmpv.c b/video/out/vo_libmpv.c
new file mode 100644
index 0000000..1df63a5
--- /dev/null
+++ b/video/out/vo_libmpv.c
@@ -0,0 +1,730 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <pthread.h>
+#include <assert.h>
+
+#include "config.h"
+
+#include "mpv_talloc.h"
+#include "common/common.h"
+#include "misc/bstr.h"
+#include "misc/dispatch.h"
+#include "common/msg.h"
+#include "options/m_config.h"
+#include "options/options.h"
+#include "aspect.h"
+#include "dr_helper.h"
+#include "vo.h"
+#include "video/mp_image.h"
+#include "sub/osd.h"
+#include "osdep/atomic.h"
+#include "osdep/timer.h"
+
+#include "common/global.h"
+#include "player/client.h"
+
+#include "libmpv.h"
+
+/*
+ * mpv_render_context is managed by the host application - the host application
+ * can access it any time, even if the VO is destroyed (or not created yet).
+ *
+ * - the libmpv user can mix render API and normal API; thus render API
+ *   functions can wait on the core, but not the reverse
+ * - the core does blocking calls into the VO thread, thus the VO functions
+ *   can't wait on the user calling the API functions
+ * - to make video timing work like it should, the VO thread waits on the
+ *   render API user anyway, and the (unlikely) deadlock is avoided with
+ *   a timeout
+ *
+ *  Locking:  mpv core > VO > mpv_render_context.lock > mp_client_api.lock
+ *              > mpv_render_context.update_lock
+ *  And: render thread > VO (wait for present)
+ *       VO > render thread (wait for present done, via timeout)
+ */
+
+struct vo_priv { // private state of the "libmpv" VO driver
+    struct mpv_render_context *ctx; // immutable after init; assigned in preinit()
+};
+
+struct mpv_render_context {
+    struct mp_log *log;             // "libmpv_render" log, created on context creation
+    struct mpv_global *global;      // obtained from the mpv_handle on creation
+    struct mp_client_api *client_api; // taken from global->client_api
+
+    atomic_bool in_use; // set by mp_render_context_acquire(), cleared in uninit()
+
+    // --- Immutable after init
+    bool advanced_control;
+    struct mp_dispatch_queue *dispatch; // NULL if advanced_control disabled
+    struct dr_helper *dr;           // NULL if advanced_control disabled, or if the backend has no get_image
+
+    pthread_mutex_t control_lock;
+    // --- Protected by control_lock
+    mp_render_cb_control_fn control_cb; // fallback handler called from control()
+    void *control_cb_ctx;
+
+    pthread_mutex_t update_lock;
+    pthread_cond_t update_cond;     // paired with update_lock
+
+    // --- Protected by update_lock
+    mpv_render_update_fn update_cb; // called from update()
+    void *update_cb_ctx;
+    bool had_kill_update;           // update during termination
+
+    pthread_mutex_t lock;
+    pthread_cond_t video_wait;      // paired with lock
+
+    // --- Protected by lock
+    struct vo_frame *next_frame;    // next frame to draw
+    int64_t present_count;          // incremented when next frame can be shown
+    int64_t expected_flip_count;    // next vsync event for next_frame
+    bool redrawing;                 // next_frame was a redraw request
+    int64_t flip_count;             // incremented by mpv_render_context_report_swap()
+    struct vo_frame *cur_frame;     // frame last handed to rendering; reused for redraws and screenshots
+    struct mp_image_params img_params; // set by reconfig(), zeroed by uninit()
+    int vp_w, vp_h;                 // target size seen by the last resize in render
+    bool flip;                      // NOTE(review): not referenced anywhere in this file
+    bool imgfmt_supported[IMGFMT_END - IMGFMT_START]; // filled on creation, read by query_format()
+    bool need_reconfig;             // the need_* flags are consumed by mpv_render_context_render()
+    bool need_resize;
+    bool need_reset;
+    bool need_update_external;
+    struct vo *vo;                  // set in preinit(), NULL again after uninit()
+
+    // --- Mostly immutable after init.
+    struct mp_hwdec_devices *hwdec_devs; // copied from the renderer on creation
+
+    // --- All of these can only be accessed from mpv_render_*() API, for
+    //     which the user makes sure they're called synchronized.
+    struct render_backend *renderer;
+    struct m_config_cache *vo_opts_cache;
+    struct mp_vo_opts *vo_opts;
+};
+
+const struct render_backend_fns *render_backends[] = { // probed in order on context creation
+    &render_backend_gpu,
+    NULL // terminator: the probing loop stops at the first NULL entry
+};
+
+static void update(struct mpv_render_context *ctx)
+{
+    // Notify the API user (if a callback is set) and wake update_cond waiters.
+    pthread_mutex_lock(&ctx->update_lock);
+    mpv_render_update_fn notify = ctx->update_cb;
+    if (notify != NULL)
+        notify(ctx->update_cb_ctx);
+    ctx->had_kill_update = true; // checked by mpv_render_context_free()
+    pthread_cond_broadcast(&ctx->update_cond);
+    pthread_mutex_unlock(&ctx->update_lock);
+}
+
+void *get_mpv_render_param(mpv_render_param *params, mpv_render_param_type type,
+                           void *def)
+{   // Returns the data of the first entry of the given type, else def.
+    for (mpv_render_param *p = params; p && p->type; p++) {
+        if (p->type == type)
+            return p->data;
+    }
+    return def;
+}
+
+static void forget_frames(struct mpv_render_context *ctx, bool all)
+{   // Wakes video_wait waiters; with all set, also drops the retained frame.
+    pthread_cond_broadcast(&ctx->video_wait);
+    if (!all)
+        return;
+    talloc_free(ctx->cur_frame);
+    ctx->cur_frame = NULL;
+}
+
+static void dispatch_wakeup(void *ptr)
+{
+    // Wakeup hook of the dispatch queue (registered in
+    // mpv_render_context_create()); ptr is the render context.
+    update((struct mpv_render_context *)ptr);
+}
+
+static struct mp_image *render_get_image(void *ptr, int imgfmt, int w, int h,
+                                         int stride_align)
+{
+    // dr_helper callback: forwards the allocation request to the backend.
+    struct render_backend *backend = ((struct mpv_render_context *)ptr)->renderer;
+    return backend->fns->get_image(backend, imgfmt, w, h, stride_align);
+}
+
+int mpv_render_context_create(mpv_render_context **res, mpv_handle *mpv,
+                              mpv_render_param *params)
+{   // Creates a context in *res; returns 0, or a negative error (*res untouched).
+    mpv_render_context *ctx = talloc_zero(NULL, mpv_render_context);
+    pthread_mutex_init(&ctx->control_lock, NULL);
+    pthread_mutex_init(&ctx->lock, NULL);
+    pthread_mutex_init(&ctx->update_lock, NULL);
+    pthread_cond_init(&ctx->update_cond, NULL);
+    pthread_cond_init(&ctx->video_wait, NULL);
+    // Hook the context up to the core that the handle belongs to.
+    ctx->global = mp_client_get_global(mpv);
+    ctx->client_api = ctx->global->client_api;
+    ctx->log = mp_log_new(ctx, ctx->global->log, "libmpv_render");
+    // Own cache of the VO options; refreshed on resize in the render call.
+    ctx->vo_opts_cache = m_config_cache_alloc(ctx, ctx->global, &vo_sub_opts);
+    ctx->vo_opts = ctx->vo_opts_cache->opts;
+    // Advanced control is opt-in; only then does a dispatch queue exist.
+    if (GET_MPV_RENDER_PARAM(params, MPV_RENDER_PARAM_ADVANCED_CONTROL, int, 0)) {
+        ctx->advanced_control = true;
+        ctx->dispatch = mp_dispatch_create(ctx);
+        mp_dispatch_set_wakeup_fn(ctx->dispatch, dispatch_wakeup, ctx);
+    }
+    // Try each backend in turn; keep the first one whose init() succeeds.
+    int err = MPV_ERROR_NOT_IMPLEMENTED;
+    for (int n = 0; render_backends[n]; n++) {
+        ctx->renderer = talloc_zero(NULL, struct render_backend);
+        *ctx->renderer = (struct render_backend){
+            .global = ctx->global,
+            .log = ctx->log,
+            .fns = render_backends[n],
+        };
+        err = ctx->renderer->fns->init(ctx->renderer, params);
+        if (err >= 0)
+            break;
+        ctx->renderer->fns->destroy(ctx->renderer); // clean up after the failed init
+        talloc_free(ctx->renderer->priv);
+        TA_FREEP(&ctx->renderer);
+        if (err != MPV_ERROR_NOT_IMPLEMENTED)
+            break; // hard error: do not try further backends
+    }
+
+    if (err < 0) { // no backend could be initialized (renderer was released above)
+        mpv_render_context_free(ctx);
+        return err;
+    }
+
+    ctx->hwdec_devs = ctx->renderer->hwdec_devs;
+    // Cache format support so query_format() need not call into the backend.
+    for (int n = IMGFMT_START; n < IMGFMT_END; n++) {
+        ctx->imgfmt_supported[n - IMGFMT_START] =
+            ctx->renderer->fns->check_format(ctx->renderer, n);
+    }
+    // Direct rendering needs both backend support and the dispatch queue.
+    if (ctx->renderer->fns->get_image && ctx->dispatch)
+        ctx->dr = dr_helper_create(ctx->dispatch, render_get_image, ctx);
+    // Publish the context; fails if another render context is already set.
+    if (!mp_set_main_render_context(ctx->client_api, ctx, true)) {
+        MP_ERR(ctx, "There is already a mpv_render_context set.\n");
+        mpv_render_context_free(ctx);
+        return MPV_ERROR_GENERIC;
+    }
+
+    *res = ctx;
+    return 0;
+}
+
+void mpv_render_context_set_update_callback(mpv_render_context *ctx,
+                                            mpv_render_update_fn callback,
+                                            void *callback_ctx)
+{   // Installs (or clears, with NULL) the update callback under update_lock.
+    pthread_mutex_lock(&ctx->update_lock);
+    ctx->update_cb_ctx = callback_ctx;
+    ctx->update_cb = callback;
+    if (callback != NULL)
+        callback(callback_ctx); // a newly set callback is fired once right away
+    pthread_mutex_unlock(&ctx->update_lock);
+}
+
+void mp_render_context_set_control_callback(mpv_render_context *ctx,
+                                            mp_render_cb_control_fn callback,
+                                            void *callback_ctx)
+{   // Replaces the callback that control() falls back to; guarded by control_lock.
+    pthread_mutex_lock(&ctx->control_lock);
+    ctx->control_cb_ctx = callback_ctx;
+    ctx->control_cb = callback;
+    pthread_mutex_unlock(&ctx->control_lock);
+}
+
+static void kill_cb(void *ptr)
+{
+    // Passed to kill_video_async(); wakes the wait in mpv_render_context_free().
+    struct mpv_render_context *render_ctx = ptr;
+    pthread_mutex_lock(&render_ctx->update_lock);
+    render_ctx->had_kill_update = true;
+    pthread_cond_broadcast(&render_ctx->update_cond);
+    pthread_mutex_unlock(&render_ctx->update_lock);
+}
+
+void mpv_render_context_free(mpv_render_context *ctx)
+{   // Tears down the context; safe for NULL and for a half-created context.
+    if (!ctx)
+        return;
+
+    // From here on, ctx becomes invisible and cannot be newly acquired. Only
+    // a VO could still hold a reference.
+    mp_set_main_render_context(ctx->client_api, ctx, false);
+
+    // If it's still in use, a VO using it must be active. Destroy the VO, and
+    // also bring down the decoder etc., which still might be using the hwdec
+    // context. The above removal guarantees it can't come back (so ctx->vo
+    // can't change to non-NULL).
+    if (atomic_load(&ctx->in_use)) {
+        kill_video_async(ctx->client_api, kill_cb, ctx);
+        while (atomic_load(&ctx->in_use)) {
+            // Decoders that are still alive may allocate or release DR images
+            // while the video chain is torn down, so keep serving the queue.
+            if (ctx->dispatch)
+                mp_dispatch_queue_process(ctx->dispatch, 0);
+
+            // Wait for kill_cb() or update() calls.
+            pthread_mutex_lock(&ctx->update_lock);
+            if (!ctx->had_kill_update)
+                pthread_cond_wait(&ctx->update_cond, &ctx->update_lock);
+            ctx->had_kill_update = false;
+            pthread_mutex_unlock(&ctx->update_lock);
+        }
+    }
+
+    assert(!atomic_load(&ctx->in_use));
+    assert(!ctx->vo);
+
+    // Possibly remaining outstanding work.
+    if (ctx->dispatch)
+        mp_dispatch_queue_process(ctx->dispatch, 0);
+
+    forget_frames(ctx, true);
+
+    // No renderer exists if mpv_render_context_create() failed to init a backend.
+    if (ctx->renderer) {
+        ctx->renderer->fns->destroy(ctx->renderer);
+        talloc_free(ctx->renderer->priv);
+    }
+    talloc_free(ctx->renderer);
+    talloc_free(ctx->dr);
+    talloc_free(ctx->dispatch);
+
+    pthread_cond_destroy(&ctx->update_cond);
+    pthread_cond_destroy(&ctx->video_wait);
+    pthread_mutex_destroy(&ctx->update_lock);
+    pthread_mutex_destroy(&ctx->lock);
+    pthread_mutex_destroy(&ctx->control_lock);
+
+    talloc_free(ctx);
+}
+
+// Atomically claim exclusive use of the context (e.g. for a VO); returns true
+// only if it was not in use before. Must stay lock-free: the caller holds an
+// external leaf lock while calling this.
+bool mp_render_context_acquire(mpv_render_context *ctx)
+{
+    bool expected_free = false;
+    return atomic_compare_exchange_strong(&ctx->in_use, &expected_free, true);
+}
+
+int mpv_render_context_render(mpv_render_context *ctx, mpv_render_param *params)
+{   // Renders next_frame (or redraws cur_frame) using the caller's params.
+    pthread_mutex_lock(&ctx->lock);
+
+    int do_render =
+        !GET_MPV_RENDER_PARAM(params, MPV_RENDER_PARAM_SKIP_RENDERING, int, 0);
+
+    if (do_render) {
+        int vp_w, vp_h;
+        int err = ctx->renderer->fns->get_target_size(ctx->renderer, params,
+                                                    &vp_w, &vp_h);
+        if (err < 0) {
+            pthread_mutex_unlock(&ctx->lock);
+            return err;
+        }
+        // Redo the layout if the target size changed or a resize was requested.
+        if (ctx->vo && (ctx->vp_w != vp_w || ctx->vp_h != vp_h ||
+                        ctx->need_resize))
+        {
+            ctx->vp_w = vp_w;
+            ctx->vp_h = vp_h;
+
+            m_config_cache_update(ctx->vo_opts_cache);
+
+            struct mp_rect src, dst;
+            struct mp_osd_res osd;
+            mp_get_src_dst_rects(ctx->log, ctx->vo_opts, ctx->vo->driver->caps,
+                                &ctx->img_params, vp_w, abs(vp_h),
+                                1.0, &src, &dst, &osd);
+
+            ctx->renderer->fns->resize(ctx->renderer, &src, &dst, &osd);
+        }
+        ctx->need_resize = false;
+    }
+    // Apply the state changes queued by the VO side via the need_* flags.
+    if (ctx->need_reconfig)
+        ctx->renderer->fns->reconfig(ctx->renderer, &ctx->img_params);
+    ctx->need_reconfig = false;
+
+    if (ctx->need_update_external)
+        ctx->renderer->fns->update_external(ctx->renderer, ctx->vo);
+    ctx->need_update_external = false;
+
+    if (ctx->need_reset) {
+        ctx->renderer->fns->reset(ctx->renderer);
+        if (ctx->cur_frame)
+            ctx->cur_frame->still = true;
+    }
+    ctx->need_reset = false;
+    // Take over the queued frame; without one, redraw the retained frame.
+    struct vo_frame *frame = ctx->next_frame;
+    int64_t wait_present_count = ctx->present_count;
+    if (frame) {
+        ctx->next_frame = NULL;
+        if (!(frame->redraw || !frame->current))
+            wait_present_count += 1; // only non-redraw, current frames wait for flip_page()
+        pthread_cond_broadcast(&ctx->video_wait); // flip_page() waits for next_frame == NULL
+        talloc_free(ctx->cur_frame);
+        ctx->cur_frame = vo_frame_ref(frame);
+    } else {
+        frame = vo_frame_ref(ctx->cur_frame);
+        if (frame)
+            frame->redraw = true;
+        MP_STATS(ctx, "glcb-noframe");
+    }
+    struct vo_frame dummy = {0}; // stand-in when there is no frame at all
+    if (!frame)
+        frame = &dummy;
+
+    pthread_mutex_unlock(&ctx->lock);
+    // From here on the backend is called without ctx->lock held.
+    MP_STATS(ctx, "glcb-render");
+
+    int err = 0;
+
+    if (do_render)
+        err = ctx->renderer->fns->render(ctx->renderer, params, frame);
+
+    if (frame != &dummy)
+        talloc_free(frame);
+    // Blocking is on by default; it ends once flip_page() bumps present_count.
+    if (GET_MPV_RENDER_PARAM(params, MPV_RENDER_PARAM_BLOCK_FOR_TARGET_TIME,
+                             int, 1))
+    {
+        pthread_mutex_lock(&ctx->lock);
+        while (wait_present_count > ctx->present_count)
+            pthread_cond_wait(&ctx->video_wait, &ctx->lock);
+        pthread_mutex_unlock(&ctx->lock);
+    }
+
+    return err;
+}
+
+void mpv_render_context_report_swap(mpv_render_context *ctx)
+{
+    // Counts a finished swap and wakes flip_page(), which may wait on flip_count.
+    MP_STATS(ctx, "glcb-reportflip");
+    pthread_mutex_lock(&ctx->lock);
+    ctx->flip_count++;
+    pthread_cond_broadcast(&ctx->video_wait);
+    pthread_mutex_unlock(&ctx->lock);
+}
+
+uint64_t mpv_render_context_update(mpv_render_context *ctx)
+{
+    // Runs pending dispatch queue work (the queue only exists with advanced
+    // control), then reports as a flag mask what is waiting for the caller.
+    if (ctx->dispatch)
+        mp_dispatch_queue_process(ctx->dispatch, 0);
+
+    pthread_mutex_lock(&ctx->lock);
+    bool frame_queued = ctx->next_frame != NULL;
+    pthread_mutex_unlock(&ctx->lock);
+
+    return frame_queued ? MPV_RENDER_UPDATE_FRAME : 0;
+}
+
+int mpv_render_context_set_parameter(mpv_render_context *ctx,
+                                     mpv_render_param param)
+{   // Pass-through: the backend decides which parameters it accepts.
+    return (*ctx->renderer->fns->set_parameter)(ctx->renderer, param);
+}
+
+int mpv_render_context_get_info(mpv_render_context *ctx,
+                                mpv_render_param param)
+{
+    // Only MPV_RENDER_PARAM_NEXT_FRAME_INFO is handled; every other type
+    // yields MPV_ERROR_NOT_IMPLEMENTED and leaves param.data untouched.
+    int status = MPV_ERROR_NOT_IMPLEMENTED;
+    pthread_mutex_lock(&ctx->lock);
+    if (param.type == MPV_RENDER_PARAM_NEXT_FRAME_INFO) {
+        mpv_render_frame_info *out = param.data;
+        struct vo_frame *queued = ctx->next_frame;
+
+        // A zeroed result means that no frame is queued.
+        *out = (mpv_render_frame_info){0};
+        if (queued) {
+            bool is_redraw = queued->redraw;
+            out->flags = MPV_RENDER_FRAME_INFO_PRESENT;
+            if (is_redraw)
+                out->flags |= MPV_RENDER_FRAME_INFO_REDRAW;
+            if (queued->repeat)
+                out->flags |= MPV_RENDER_FRAME_INFO_REPEAT;
+            if (queued->display_synced && !is_redraw)
+                out->flags |= MPV_RENDER_FRAME_INFO_BLOCK_VSYNC;
+            out->target_time = queued->pts;
+        }
+        status = 0;
+    }
+    pthread_mutex_unlock(&ctx->lock);
+    return status;
+}
+
+static void draw_frame(struct vo *vo, struct vo_frame *frame)
+{
+    // Queue a reference to the frame for the render thread, then notify the
+    // API user. The previous frame must have been consumed or dropped already.
+    struct mpv_render_context *ctx = ((struct vo_priv *)vo->priv)->ctx;
+    bool is_redraw = frame->redraw || !frame->current;
+    pthread_mutex_lock(&ctx->lock);
+    assert(!ctx->next_frame);
+    ctx->redrawing = is_redraw;
+    ctx->expected_flip_count = ctx->flip_count + 1;
+    ctx->next_frame = vo_frame_ref(frame);
+    pthread_mutex_unlock(&ctx->lock);
+    update(ctx);
+}
+
+static void flip_page(struct vo *vo)
+{   // Waits (bounded by a timeout) for the API user to render and present.
+    struct vo_priv *p = vo->priv;
+    struct mpv_render_context *ctx = p->ctx;
+    struct timespec ts = mp_rel_time_to_timespec(0.2); // one timeout value, used by both timed waits
+
+    pthread_mutex_lock(&ctx->lock);
+
+    // Wait until frame was rendered
+    while (ctx->next_frame) {
+        if (pthread_cond_timedwait(&ctx->video_wait, &ctx->lock, &ts)) {
+            if (ctx->next_frame) { // wait failed and the frame is still queued
+                MP_VERBOSE(vo, "mpv_render_context_render() not being called "
+                           "or stuck.\n");
+                goto done;
+            }
+        }
+    }
+
+    // Unblock mpv_render_context_render().
+    ctx->present_count += 1;
+    pthread_cond_broadcast(&ctx->video_wait);
+
+    if (ctx->redrawing)
+        goto done; // do not block for redrawing
+
+    // Wait until frame was presented
+    while (ctx->expected_flip_count > ctx->flip_count) {
+        // mpv_render_context_report_swap() is declared as optional API.
+        // Assume the user calls it consistently _if_ it's called at all.
+        if (!ctx->flip_count)
+            break;
+        if (pthread_cond_timedwait(&ctx->video_wait, &ctx->lock, &ts)) {
+            MP_VERBOSE(vo, "mpv_render_report_swap() not being called.\n");
+            goto done;
+        }
+    }
+
+done:
+
+    // Cleanup after the API user is not reacting, or is being unusually slow.
+    if (ctx->next_frame) {
+        talloc_free(ctx->cur_frame);
+        ctx->cur_frame = ctx->next_frame; // keep the unrendered frame for later redraws
+        ctx->next_frame = NULL;
+        ctx->present_count += 2; // NOTE(review): presumably also releases a render call waiting for the next count; confirm
+        pthread_cond_signal(&ctx->video_wait);
+        vo_increment_drop_count(vo, 1); // the queued frame was not picked up by a render call
+    }
+
+    pthread_mutex_unlock(&ctx->lock);
+}
+
+static int query_format(struct vo *vo, int format)
+{
+    // Answered from the table filled in mpv_render_context_create(); formats
+    // outside [IMGFMT_START, IMGFMT_END) are reported as unsupported.
+    struct mpv_render_context *ctx = ((struct vo_priv *)vo->priv)->ctx;
+    bool in_range = format >= IMGFMT_START && format < IMGFMT_END;
+
+    pthread_mutex_lock(&ctx->lock);
+    bool supported = in_range && ctx->imgfmt_supported[format - IMGFMT_START];
+    pthread_mutex_unlock(&ctx->lock);
+    return supported;
+}
+
+static void run_control_on_render_thread(void *p)
+{   // Run through mp_dispatch_run(); p is the args array built in control().
+    void **args = p; // layout: {ctx, request, data, pointer to the int result}
+    struct mpv_render_context *ctx = args[0];
+    int request = (intptr_t)args[1];
+    void *data = args[2];
+    int ret = VO_NOTIMPL;
+    // NOTE(review): ret is never changed below, so VO_NOTIMPL is always reported.
+    switch (request) {
+    case VOCTRL_SCREENSHOT: {
+        pthread_mutex_lock(&ctx->lock);
+        struct vo_frame *frame = vo_frame_ref(ctx->cur_frame); // own reference, used after unlocking
+        pthread_mutex_unlock(&ctx->lock);
+        if (frame && ctx->renderer->fns->screenshot)
+            ctx->renderer->fns->screenshot(ctx->renderer, frame, data);
+        talloc_free(frame);
+        break;
+    }
+    }
+    // Hand the result back through the caller's int (args[3]).
+    *(int *)args[3] = ret;
+}
+
+static int control(struct vo *vo, uint32_t request, void *data)
+{   // VOCTRL entry point: handled here, on the render thread, or by control_cb.
+    struct vo_priv *p = vo->priv;
+    struct mpv_render_context *ctx = p->ctx;
+    // Requests handled directly by the VO; each of them also requests a redraw.
+    switch (request) {
+    case VOCTRL_RESET:
+        pthread_mutex_lock(&ctx->lock);
+        forget_frames(ctx, false); // wakes waiters, keeps cur_frame
+        ctx->need_reset = true;
+        pthread_mutex_unlock(&ctx->lock);
+        vo->want_redraw = true;
+        return VO_TRUE;
+    case VOCTRL_PAUSE:
+        vo->want_redraw = true;
+        return VO_TRUE;
+    case VOCTRL_SET_EQUALIZER:
+        vo->want_redraw = true;
+        return VO_TRUE;
+    case VOCTRL_SET_PANSCAN:
+        pthread_mutex_lock(&ctx->lock);
+        ctx->need_resize = true; // picked up by mpv_render_context_render()
+        pthread_mutex_unlock(&ctx->lock);
+        vo->want_redraw = true;
+        return VO_TRUE;
+    case VOCTRL_UPDATE_RENDER_OPTS:
+        pthread_mutex_lock(&ctx->lock);
+        ctx->need_update_external = true; // picked up by mpv_render_context_render()
+        pthread_mutex_unlock(&ctx->lock);
+        vo->want_redraw = true;
+        return VO_TRUE;
+    }
+
+    // VOCTRLs to be run on the renderer thread (if possible at all).
+    switch (request) {
+    case VOCTRL_SCREENSHOT:
+        if (ctx->dispatch) {
+            int ret;
+            void *args[] = {ctx, (void *)(intptr_t)request, data, &ret};
+            mp_dispatch_run(ctx->dispatch, run_control_on_render_thread, args); // assumes this returns only after the call ran (args and ret are on this stack)
+            return ret;
+        }
+    }
+    // Everything else (and screenshots without a dispatch queue) goes to control_cb.
+    int r = VO_NOTIMPL;
+    pthread_mutex_lock(&ctx->control_lock);
+    if (ctx->control_cb) {
+        int events = 0;
+        r = p->ctx->control_cb(vo, p->ctx->control_cb_ctx,
+                               &events, request, data);
+        vo_event(vo, events);
+    }
+    pthread_mutex_unlock(&ctx->control_lock);
+
+    return r;
+}
+
+static struct mp_image *get_image(struct vo *vo, int imgfmt, int w, int h,
+                                  int stride_align)
+{
+    // Direct rendering goes through the dr_helper; without one (ctx->dr is
+    // NULL) no image is provided and NULL is returned.
+    struct dr_helper *dr = ((struct vo_priv *)vo->priv)->ctx->dr;
+
+    if (dr == NULL)
+        return NULL;
+    return dr_helper_get_image(dr, imgfmt, w, h, stride_align);
+}
+
+static int reconfig(struct vo *vo, struct mp_image_params *params)
+{
+    // Store the new image parameters and flag them for the next
+    // mpv_render_context_render() call; the retained frame is dropped.
+    struct mpv_render_context *ctx = ((struct vo_priv *)vo->priv)->ctx;
+    pthread_mutex_lock(&ctx->lock);
+    forget_frames(ctx, true);
+    ctx->need_resize = true;
+    ctx->need_reconfig = true;
+    ctx->img_params = *params;
+    pthread_mutex_unlock(&ctx->lock);
+
+    // Also pass the event on through the regular control path.
+    control(vo, VOCTRL_RECONFIG, NULL);
+    return 0;
+}
+
+static void uninit(struct vo *vo)
+{   // Detaches the VO from the render context and gives the context up.
+    struct vo_priv *p = vo->priv;
+    struct mpv_render_context *ctx = p->ctx;
+    // Goes to the control callback (if one is set) before the state is reset.
+    control(vo, VOCTRL_UNINIT, NULL);
+
+    pthread_mutex_lock(&ctx->lock);
+    // Back to "no video": all need_* work is redone by the next render call.
+    forget_frames(ctx, true);
+    ctx->img_params = (struct mp_image_params){0};
+    ctx->need_reconfig = true;
+    ctx->need_resize = true;
+    ctx->need_update_external = true;
+    ctx->need_reset = true;
+    ctx->vo = NULL;
+    pthread_mutex_unlock(&ctx->lock);
+    // Clear the in-use flag that mp_render_context_acquire() sets.
+    bool state = atomic_exchange(&ctx->in_use, false);
+    assert(state); // obviously must have been set
+    // Wakes mpv_render_context_free(), which may be waiting for in_use to clear.
+    update(ctx);
+}
+
+static int preinit(struct vo *vo)
+{
+    // Bind this VO to the render context the API user created, if any. The
+    // VO cannot work without one, so failing to acquire it fails preinit.
+    struct vo_priv *priv = vo->priv;
+
+    priv->ctx = mp_client_api_acquire_render_context(vo->global->client_api);
+    if (priv->ctx == NULL) {
+        if (!vo->probing)
+            MP_FATAL(vo, "No render context set.\n");
+        return -1;
+    }
+
+    struct mpv_render_context *ctx = priv->ctx;
+    // Announce the VO; flag a resize and an external update for the next render.
+    pthread_mutex_lock(&ctx->lock);
+    ctx->need_update_external = true;
+    ctx->need_resize = true;
+    ctx->vo = vo;
+    pthread_mutex_unlock(&ctx->lock);
+
+    vo->hwdec_devs = ctx->hwdec_devs;
+    control(vo, VOCTRL_PREINIT, NULL);
+    return 0;
+}
+
+const struct vo_driver video_out_libmpv = { // VO driver table for the render API
+    .description = "render API for libmpv",
+    .name = "libmpv",
+    .caps = VO_CAP_ROTATE90,
+    .preinit = preinit,             // fails if no render context can be acquired
+    .query_format = query_format,
+    .reconfig = reconfig,
+    .control = control,
+    .get_image_ts = get_image,      // NOTE(review): "_ts" presumably marks a thread-safe variant; confirm in vo.h
+    .draw_frame = draw_frame,       // queues the frame for the render thread
+    .flip_page = flip_page,         // waits for the API user, with a timeout
+    .uninit = uninit,
+    .priv_size = sizeof(struct vo_priv),
+};
diff --git a/video/out/vo_mediacodec_embed.c b/video/out/vo_mediacodec_embed.c
index 63975e9..dc29749 100644
--- a/video/out/vo_mediacodec_embed.c
+++ b/video/out/vo_mediacodec_embed.c
@@ -107,7 +107,7 @@ static void uninit(struct vo *vo)
 const struct vo_driver video_out_mediacodec_embed = {
     .description = "Android (Embedded MediaCodec Surface)",
     .name = "mediacodec_embed",
-    .caps = VO_CAP_NOREDRAW,
+    .caps = VO_CAP_NORETAIN,
     .preinit = preinit,
     .query_format = query_format,
     .control = control,
diff --git a/video/out/vo_opengl_cb.c b/video/out/vo_opengl_cb.c
deleted file mode 100644
index c8dab15..0000000
--- a/video/out/vo_opengl_cb.c
+++ /dev/null
@@ -1,542 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <stdbool.h>
-#include <limits.h>
-#include <pthread.h>
-#include <assert.h>
-
-#include "config.h"
-
-#include "mpv_talloc.h"
-#include "common/common.h"
-#include "misc/bstr.h"
-#include "common/msg.h"
-#include "options/m_config.h"
-#include "options/options.h"
-#include "aspect.h"
-#include "vo.h"
-#include "video/mp_image.h"
-#include "sub/osd.h"
-#include "osdep/timer.h"
-
-#include "common/global.h"
-#include "player/client.h"
-
-#include "gpu/video.h"
-#include "gpu/hwdec.h"
-#include "opengl/common.h"
-#include "opengl/context.h"
-#include "opengl/ra_gl.h"
-
-#include "libmpv/opengl_cb.h"
-
-/*
- * mpv_opengl_cb_context is created by the host application - the host application
- * can access it any time, even if the VO is destroyed (or not created yet).
- * The OpenGL object allows initializing the renderer etc. The VO object is only
- * here to transfer the video frames somehow.
- *
- * Locking hierarchy:
- * - the libmpv user can mix openglcb and normal API; thus openglcb API
- *   functions can wait on the core, but not the reverse
- * - the core does blocking calls into the VO thread, thus the VO functions
- *   can't wait on the user calling the API functions
- * - to make video timing work like it should, the VO thread waits on the
- *   openglcb API user anyway, and the (unlikely) deadlock is avoided with
- *   a timeout
- */
-
-struct vo_priv {
-    struct mpv_opengl_cb_context *ctx;
-};
-
-struct mpv_opengl_cb_context {
-    struct mp_log *log;
-    struct mpv_global *global;
-    struct mp_client_api *client_api;
-
-    pthread_mutex_t lock;
-    pthread_cond_t wakeup;
-
-    // --- Protected by lock
-    bool initialized;
-    mpv_opengl_cb_update_fn update_cb;
-    void *update_cb_ctx;
-    struct vo_frame *next_frame;    // next frame to draw
-    int64_t present_count;          // incremented when next frame can be shown
-    int64_t expected_flip_count;    // next vsync event for next_frame
-    bool redrawing;                 // next_frame was a redraw request
-    int64_t flip_count;
-    struct vo_frame *cur_frame;
-    struct mp_image_params img_params;
-    bool reconfigured, reset;
-    int vp_w, vp_h;
-    bool flip;
-    bool force_update;
-    bool imgfmt_supported[IMGFMT_END - IMGFMT_START];
-    bool update_new_opts;
-    struct vo *active;
-
-    // --- This is only mutable while initialized=false, during which nothing
-    //     except the OpenGL context manager is allowed to access it.
-    struct mp_hwdec_devices *hwdec_devs;
-
-    // --- All of these can only be accessed from the thread where the host
-    //     application's OpenGL context is current - i.e. only while the
-    //     host application is calling certain mpv_opengl_cb_* APIs.
-    GL *gl;
-    struct ra_ctx *ra_ctx;
-    struct gl_video *renderer;
-    struct m_config_cache *vo_opts_cache;
-    struct mp_vo_opts *vo_opts;
-};
-
-static void update(struct vo_priv *p);
-
-static void forget_frames(struct mpv_opengl_cb_context *ctx, bool all)
-{
-    pthread_cond_broadcast(&ctx->wakeup);
-    if (all) {
-        talloc_free(ctx->cur_frame);
-        ctx->cur_frame = NULL;
-    }
-}
-
-static void free_ctx(void *ptr)
-{
-    mpv_opengl_cb_context *ctx = ptr;
-
-    // This can trigger if the client API user doesn't call
-    // mpv_opengl_cb_uninit_gl() properly.
-    assert(!ctx->initialized);
-
-    pthread_cond_destroy(&ctx->wakeup);
-    pthread_mutex_destroy(&ctx->lock);
-}
-
-struct mpv_opengl_cb_context *mp_opengl_create(struct mpv_global *g,
-                                               struct mp_client_api *client_api)
-{
-    mpv_opengl_cb_context *ctx = talloc_zero(NULL, mpv_opengl_cb_context);
-    talloc_set_destructor(ctx, free_ctx);
-    pthread_mutex_init(&ctx->lock, NULL);
-    pthread_cond_init(&ctx->wakeup, NULL);
-
-    ctx->global = g;
-    ctx->log = mp_log_new(ctx, g->log, "opengl-cb");
-    ctx->client_api = client_api;
-
-    ctx->vo_opts_cache = m_config_cache_alloc(ctx, ctx->global, &vo_sub_opts);
-    ctx->vo_opts = ctx->vo_opts_cache->opts;
-
-    return ctx;
-}
-
-void mpv_opengl_cb_set_update_callback(struct mpv_opengl_cb_context *ctx,
-                                      mpv_opengl_cb_update_fn callback,
-                                      void *callback_ctx)
-{
-    pthread_mutex_lock(&ctx->lock);
-    ctx->update_cb = callback;
-    ctx->update_cb_ctx = callback_ctx;
-    pthread_mutex_unlock(&ctx->lock);
-}
-
-// Reset some GL attributes the user might clobber. For mid-term compatibility
-// only - we expect both user code and our code to do this correctly.
-static void reset_gl_state(GL *gl)
-{
-    gl->ActiveTexture(GL_TEXTURE0);
-    if (gl->mpgl_caps & MPGL_CAP_ROW_LENGTH)
-        gl->PixelStorei(GL_UNPACK_ROW_LENGTH, 0);
-    gl->PixelStorei(GL_UNPACK_ALIGNMENT, 4);
-}
-
-int mpv_opengl_cb_init_gl(struct mpv_opengl_cb_context *ctx, const char *exts,
-                          mpv_opengl_cb_get_proc_address_fn get_proc_address,
-                          void *get_proc_address_ctx)
-{
-    if (ctx->renderer)
-        return MPV_ERROR_INVALID_PARAMETER;
-
-    talloc_free(ctx->gl);
-    ctx->gl = talloc_zero(ctx, GL);
-
-    mpgl_load_functions2(ctx->gl, get_proc_address, get_proc_address_ctx,
-                         exts, ctx->log);
-    if (!ctx->gl->version && !ctx->gl->es) {
-        MP_FATAL(ctx, "OpenGL not initialized.\n");
-        return MPV_ERROR_UNSUPPORTED;
-    }
-
-    // initialize a blank ra_ctx to reuse ra_gl_ctx
-    ctx->ra_ctx = talloc_zero(ctx, struct ra_ctx);
-    ctx->ra_ctx->log = ctx->log;
-    ctx->ra_ctx->global = ctx->global;
-    ctx->ra_ctx->opts = (struct ra_ctx_opts) {
-        .probing = false,
-        .allow_sw = true,
-    };
-
-    static const struct ra_swapchain_fns empty_swapchain_fns = {0};
-    struct ra_gl_ctx_params gl_params = {
-        // vo_opengl_cb is essentially like a gigantic external swapchain where
-        // the user is in charge of presentation / swapping etc. But we don't
-        // actually need to provide any of these functions, since we can just
-        // not call them to begin with - so just set it to an empty object to
-        // signal to ra_gl_ctx that we don't care about its latency emulation
-        // functionality
-        .external_swapchain = &empty_swapchain_fns
-    };
-
-    ctx->gl->SwapInterval = NULL; // we shouldn't randomly change this, so lock it
-    if (!ra_gl_ctx_init(ctx->ra_ctx, ctx->gl, gl_params))
-        return MPV_ERROR_UNSUPPORTED;
-
-    ctx->renderer = gl_video_init(ctx->ra_ctx->ra, ctx->log, ctx->global);
-
-    ctx->hwdec_devs = hwdec_devices_create();
-    gl_video_load_hwdecs(ctx->renderer, ctx->hwdec_devs, true);
-
-    pthread_mutex_lock(&ctx->lock);
-    for (int n = IMGFMT_START; n < IMGFMT_END; n++) {
-        ctx->imgfmt_supported[n - IMGFMT_START] =
-            gl_video_check_format(ctx->renderer, n);
-    }
-    ctx->initialized = true;
-    pthread_mutex_unlock(&ctx->lock);
-
-    reset_gl_state(ctx->gl);
-    return 0;
-}
-
-int mpv_opengl_cb_uninit_gl(struct mpv_opengl_cb_context *ctx)
-{
-    if (!ctx)
-        return 0;
-
-    // Bring down the decoder etc., which still might be using the hwdec
-    // context. Setting initialized=false guarantees it can't come back.
-
-    pthread_mutex_lock(&ctx->lock);
-    forget_frames(ctx, true);
-    ctx->initialized = false;
-    pthread_mutex_unlock(&ctx->lock);
-
-    kill_video(ctx->client_api);
-
-    pthread_mutex_lock(&ctx->lock);
-    assert(!ctx->active);
-    pthread_mutex_unlock(&ctx->lock);
-
-    gl_video_uninit(ctx->renderer);
-    ctx->renderer = NULL;
-    hwdec_devices_destroy(ctx->hwdec_devs);
-    ctx->hwdec_devs = NULL;
-    ra_gl_ctx_uninit(ctx->ra_ctx);
-    talloc_free(ctx->ra_ctx);
-    talloc_free(ctx->gl);
-    ctx->ra_ctx = NULL;
-    ctx->gl = NULL;
-    return 0;
-}
-
-int mpv_opengl_cb_draw(mpv_opengl_cb_context *ctx, int fbo, int vp_w, int vp_h)
-{
-    assert(ctx->renderer);
-
-    if (fbo && !(ctx->gl->mpgl_caps & MPGL_CAP_FB)) {
-        MP_FATAL(ctx, "Rendering to FBO requested, but no FBO extension found!\n");
-        return MPV_ERROR_UNSUPPORTED;
-    }
-
-    reset_gl_state(ctx->gl);
-
-    pthread_mutex_lock(&ctx->lock);
-
-    struct vo *vo = ctx->active;
-
-    ctx->force_update |= ctx->reconfigured;
-
-    if (ctx->vp_w != vp_w || ctx->vp_h != vp_h)
-        ctx->force_update = true;
-
-    if (ctx->force_update && vo) {
-        ctx->force_update = false;
-        ctx->vp_w = vp_w;
-        ctx->vp_h = vp_h;
-
-        m_config_cache_update(ctx->vo_opts_cache);
-
-        struct mp_rect src, dst;
-        struct mp_osd_res osd;
-        mp_get_src_dst_rects(ctx->log, ctx->vo_opts, vo->driver->caps,
-                             &ctx->img_params, vp_w, abs(vp_h),
-                             1.0, &src, &dst, &osd);
-
-        gl_video_resize(ctx->renderer, &src, &dst, &osd);
-    }
-
-    if (ctx->reconfigured) {
-        gl_video_set_osd_source(ctx->renderer, vo ? vo->osd : NULL);
-        gl_video_config(ctx->renderer, &ctx->img_params);
-    }
-    if (ctx->update_new_opts) {
-        if (vo)
-            gl_video_configure_queue(ctx->renderer, vo);
-        int debug;
-        mp_read_option_raw(ctx->global, "gpu-debug", &m_option_type_flag,
-                           &debug);
-        ctx->gl->debug_context = debug;
-        ra_gl_set_debug(ctx->ra_ctx->ra, debug);
-        if (gl_video_icc_auto_enabled(ctx->renderer))
-            MP_ERR(ctx, "icc-profile-auto is not available with opengl-cb\n");
-    }
-    ctx->reconfigured = false;
-    ctx->update_new_opts = false;
-
-    if (ctx->reset) {
-        gl_video_reset(ctx->renderer);
-        ctx->reset = false;
-        if (ctx->cur_frame)
-            ctx->cur_frame->still = true;
-    }
-
-    struct vo_frame *frame = ctx->next_frame;
-    int64_t wait_present_count = ctx->present_count;
-    if (frame) {
-        ctx->next_frame = NULL;
-        if (!(frame->redraw || !frame->current))
-            wait_present_count += 1;
-        pthread_cond_signal(&ctx->wakeup);
-        talloc_free(ctx->cur_frame);
-        ctx->cur_frame = vo_frame_ref(frame);
-    } else {
-        frame = vo_frame_ref(ctx->cur_frame);
-        if (frame)
-            frame->redraw = true;
-        MP_STATS(ctx, "glcb-noframe");
-    }
-    struct vo_frame dummy = {0};
-    if (!frame)
-        frame = &dummy;
-
-    pthread_mutex_unlock(&ctx->lock);
-
-    MP_STATS(ctx, "glcb-render");
-    struct ra_swapchain *sw = ctx->ra_ctx->swapchain;
-    struct ra_fbo target;
-    ra_gl_ctx_resize(sw, vp_w, abs(vp_h), fbo);
-    ra_gl_ctx_start_frame(sw, &target);
-    target.flip = vp_h < 0;
-    gl_video_render_frame(ctx->renderer, frame, target);
-    ra_gl_ctx_submit_frame(sw, frame);
-
-    reset_gl_state(ctx->gl);
-
-    if (frame != &dummy)
-        talloc_free(frame);
-
-    pthread_mutex_lock(&ctx->lock);
-    while (wait_present_count > ctx->present_count)
-        pthread_cond_wait(&ctx->wakeup, &ctx->lock);
-    pthread_mutex_unlock(&ctx->lock);
-
-    return 0;
-}
-
-int mpv_opengl_cb_report_flip(mpv_opengl_cb_context *ctx, int64_t time)
-{
-    MP_STATS(ctx, "glcb-reportflip");
-
-    pthread_mutex_lock(&ctx->lock);
-    ctx->flip_count += 1;
-    pthread_cond_signal(&ctx->wakeup);
-    pthread_mutex_unlock(&ctx->lock);
-
-    return 0;
-}
-
-// Called locked.
-static void update(struct vo_priv *p)
-{
-    if (p->ctx->update_cb)
-        p->ctx->update_cb(p->ctx->update_cb_ctx);
-}
-
-static void draw_frame(struct vo *vo, struct vo_frame *frame)
-{
-    struct vo_priv *p = vo->priv;
-
-    pthread_mutex_lock(&p->ctx->lock);
-    assert(!p->ctx->next_frame);
-    p->ctx->next_frame = vo_frame_ref(frame);
-    p->ctx->expected_flip_count = p->ctx->flip_count + 1;
-    p->ctx->redrawing = frame->redraw || !frame->current;
-    update(p);
-    pthread_mutex_unlock(&p->ctx->lock);
-}
-
-static void flip_page(struct vo *vo)
-{
-    struct vo_priv *p = vo->priv;
-    struct timespec ts = mp_rel_time_to_timespec(0.2);
-
-    pthread_mutex_lock(&p->ctx->lock);
-
-    // Wait until frame was rendered
-    while (p->ctx->next_frame) {
-        if (pthread_cond_timedwait(&p->ctx->wakeup, &p->ctx->lock, &ts)) {
-            if (p->ctx->next_frame) {
-                MP_VERBOSE(vo, "mpv_opengl_cb_draw() not being called or stuck.\n");
-                goto done;
-            }
-        }
-    }
-
-    // Unblock mpv_opengl_cb_draw().
-    p->ctx->present_count += 1;
-    pthread_cond_signal(&p->ctx->wakeup);
-
-    if (p->ctx->redrawing)
-        goto done; // do not block for redrawing
-
-    // Wait until frame was presented
-    while (p->ctx->expected_flip_count > p->ctx->flip_count) {
-        // mpv_opengl_cb_report_flip() is declared as optional API.
-        // Assume the user calls it consistently _if_ it's called at all.
-        if (!p->ctx->flip_count)
-            break;
-        if (pthread_cond_timedwait(&p->ctx->wakeup, &p->ctx->lock, &ts)) {
-            MP_VERBOSE(vo, "mpv_opengl_cb_report_flip() not being called.\n");
-            goto done;
-        }
-    }
-
-done:
-
-    // Cleanup after the API user is not reacting, or is being unusually slow.
-    if (p->ctx->next_frame) {
-        talloc_free(p->ctx->cur_frame);
-        p->ctx->cur_frame = p->ctx->next_frame;
-        p->ctx->next_frame = NULL;
-        p->ctx->present_count += 2;
-        pthread_cond_signal(&p->ctx->wakeup);
-        vo_increment_drop_count(vo, 1);
-    }
-
-    pthread_mutex_unlock(&p->ctx->lock);
-}
-
-static int query_format(struct vo *vo, int format)
-{
-    struct vo_priv *p = vo->priv;
-
-    bool ok = false;
-    pthread_mutex_lock(&p->ctx->lock);
-    if (format >= IMGFMT_START && format < IMGFMT_END)
-        ok = p->ctx->imgfmt_supported[format - IMGFMT_START];
-    pthread_mutex_unlock(&p->ctx->lock);
-    return ok;
-}
-
-static int reconfig(struct vo *vo, struct mp_image_params *params)
-{
-    struct vo_priv *p = vo->priv;
-
-    pthread_mutex_lock(&p->ctx->lock);
-    forget_frames(p->ctx, true);
-    p->ctx->img_params = *params;
-    p->ctx->reconfigured = true;
-    pthread_mutex_unlock(&p->ctx->lock);
-
-    return 0;
-}
-
-static int control(struct vo *vo, uint32_t request, void *data)
-{
-    struct vo_priv *p = vo->priv;
-
-    switch (request) {
-    case VOCTRL_RESET:
-        pthread_mutex_lock(&p->ctx->lock);
-        forget_frames(p->ctx, false);
-        p->ctx->reset = true;
-        pthread_mutex_unlock(&p->ctx->lock);
-        return VO_TRUE;
-    case VOCTRL_PAUSE:
-        vo->want_redraw = true;
-        return VO_TRUE;
-    case VOCTRL_SET_EQUALIZER:
-        vo->want_redraw = true;
-        return VO_TRUE;
-    case VOCTRL_SET_PANSCAN:
-        pthread_mutex_lock(&p->ctx->lock);
-        p->ctx->force_update = true;
-        update(p);
-        pthread_mutex_unlock(&p->ctx->lock);
-        return VO_TRUE;
-    case VOCTRL_UPDATE_RENDER_OPTS:
-        pthread_mutex_lock(&p->ctx->lock);
-        p->ctx->update_new_opts = true;
-        update(p);
-        pthread_mutex_unlock(&p->ctx->lock);
-        return VO_TRUE;
-    }
-
-    return VO_NOTIMPL;
-}
-
-static void uninit(struct vo *vo)
-{
-    struct vo_priv *p = vo->priv;
-
-    pthread_mutex_lock(&p->ctx->lock);
-    forget_frames(p->ctx, true);
-    p->ctx->img_params = (struct mp_image_params){0};
-    p->ctx->reconfigured = true;
-    p->ctx->active = NULL;
-    update(p);
-    pthread_mutex_unlock(&p->ctx->lock);
-}
-
-static int preinit(struct vo *vo)
-{
-    struct vo_priv *p = vo->priv;
-    p->ctx = vo->extra.opengl_cb_context;
-    if (!p->ctx) {
-        MP_FATAL(vo, "No context set.\n");
-        return -1;
-    }
-
-    pthread_mutex_lock(&p->ctx->lock);
-    if (!p->ctx->initialized) {
-        MP_FATAL(vo, "OpenGL context not initialized.\n");
-        pthread_mutex_unlock(&p->ctx->lock);
-        return -1;
-    }
-    p->ctx->active = vo;
-    p->ctx->reconfigured = true;
-    p->ctx->update_new_opts = true;
-    pthread_mutex_unlock(&p->ctx->lock);
-
-    vo->hwdec_devs = p->ctx->hwdec_devs;
-
-    return 0;
-}
-
-const struct vo_driver video_out_opengl_cb = {
-    .description = "OpenGL Callbacks for libmpv",
-    .name = "opengl-cb",
-    .caps = VO_CAP_ROTATE90,
-    .preinit = preinit,
-    .query_format = query_format,
-    .reconfig = reconfig,
-    .control = control,
-    .draw_frame = draw_frame,
-    .flip_page = flip_page,
-    .uninit = uninit,
-    .priv_size = sizeof(struct vo_priv),
-};
diff --git a/video/out/vo_rpi.c b/video/out/vo_rpi.c
index 4322a3f..2065151 100644
--- a/video/out/vo_rpi.c
+++ b/video/out/vo_rpi.c
@@ -266,7 +266,7 @@ static void update_osd(struct vo *vo)
         .flip = true,
     };
     gl_video_set_osd_pts(p->gl_video, p->osd_pts);
-    gl_video_render_frame(p->gl_video, &frame, target);
+    gl_video_render_frame(p->gl_video, &frame, target, RENDER_FRAME_DEF);
     ra_tex_free(p->egl.ra, &target.tex);
 
     MP_STATS(vo, "stop rpi_osd");
diff --git a/video/out/vo_sdl.c b/video/out/vo_sdl.c
index 1667b2c..a7450e5 100644
--- a/video/out/vo_sdl.c
+++ b/video/out/vo_sdl.c
@@ -180,6 +180,7 @@ struct priv {
     int brightness, contrast;
     char *window_title;
     Uint32 wakeup_event;
+    bool screensaver_enabled;
 
     // options
     int allow_sw;
@@ -402,10 +403,22 @@ static void check_resize(struct vo *vo)
         resize(vo, w, h);
 }
 
+static inline void set_screensaver(bool enabled) // make SDL's state match
+{
+    if (!!enabled == !!SDL_IsScreenSaverEnabled()) // already as requested
+        return;
+
+    if (enabled)
+        SDL_EnableScreenSaver();
+    else
+        SDL_DisableScreenSaver();
+}
+
 static void set_fullscreen(struct vo *vo)
 {
     struct priv *vc = vo->priv;
     int fs = vo->opts->fullscreen;
+    SDL_bool prev_screensaver_state = SDL_IsScreenSaverEnabled();
 
     Uint32 fs_flag;
     if (vc->switch_mode)
@@ -428,7 +441,7 @@ static void set_fullscreen(struct vo *vo)
     }
 
     // toggling fullscreen might recreate the window, so better guard for this
-    SDL_DisableScreenSaver();
+    set_screensaver(prev_screensaver_state);
 
     force_resize(vo);
 }
@@ -507,8 +520,7 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
 
     resize(vo, win_w, win_h);
 
-    SDL_DisableScreenSaver();
-
+    set_screensaver(vc->screensaver_enabled);
     set_fullscreen(vo);
 
     SDL_ShowWindow(vc->window);
@@ -917,6 +929,14 @@ static int control(struct vo *vo, uint32_t request, void *data)
     case VOCTRL_SET_CURSOR_VISIBILITY:
         SDL_ShowCursor(*(bool *)data);
         return true;
+    case VOCTRL_KILL_SCREENSAVER:
+        vc->screensaver_enabled = false;
+        set_screensaver(vc->screensaver_enabled);
+        return VO_TRUE;
+    case VOCTRL_RESTORE_SCREENSAVER:
+        vc->screensaver_enabled = true;
+        set_screensaver(vc->screensaver_enabled);
+        return VO_TRUE;
     case VOCTRL_UPDATE_WINDOW_TITLE:
         talloc_free(vc->window_title);
         vc->window_title = talloc_strdup(vc, (char *)data);
@@ -936,6 +956,7 @@ const struct vo_driver video_out_sdl = {
     .priv_defaults = &(const struct priv) {
         .renderer_index = -1,
         .vsync = 1,
+        .screensaver_enabled = false,
     },
     .options = (const struct m_option []){
         OPT_FLAG("sw", allow_sw, 0),
diff --git a/video/out/vo_tct.c b/video/out/vo_tct.c
index dbe5d69..6a07786 100644
--- a/video/out/vo_tct.c
+++ b/video/out/vo_tct.c
@@ -207,7 +207,7 @@ static int reconfig(struct vo *vo, struct mp_image_params *params)
     if (p->buffer)
         free(p->buffer);
 
-    mp_sws_set_from_cmdline(p->sws, vo->opts->sws_opts);
+    mp_sws_set_from_cmdline(p->sws, vo->global);
     p->sws->src = *params;
     p->sws->dst = (struct mp_image_params) {
         .imgfmt = IMGFMT,
diff --git a/video/out/vo_vaapi.c b/video/out/vo_vaapi.c
index a3f7015..c8ffffc 100644
--- a/video/out/vo_vaapi.c
+++ b/video/out/vo_vaapi.c
@@ -816,7 +816,7 @@ static int preinit(struct vo *vo)
     if (!p->image_formats)
         goto fail;
 
-    p->pool = mp_image_pool_new(MAX_OUTPUT_SURFACES + 3);
+    p->pool = mp_image_pool_new(p);
     va_pool_set_allocator(p->pool, p->mpvaapi, VA_RT_FORMAT_YUV420);
 
     int max_subpic_formats = vaMaxNumSubpictureFormats(p->display);
diff --git a/video/out/vo_vdpau.c b/video/out/vo_vdpau.c
index ada3fb8..9871355 100644
--- a/video/out/vo_vdpau.c
+++ b/video/out/vo_vdpau.c
@@ -86,6 +86,7 @@ struct vdpctx {
     int                                current_duration;
 
     int                                output_surface_w, output_surface_h;
+    int                                rotation;
 
     int                                force_yuv;
     struct mp_vdpau_mixer             *video_mixer;
@@ -244,8 +245,7 @@ static void forget_frames(struct vo *vo, bool seek_reset)
 static int s_size(int max, int s, int disp)
 {
     disp = MPMAX(1, disp);
-    s += s / 2;
-    return MPMIN(max, s >= disp ? s : disp);
+    return MPMIN(max, MPMAX(s, disp));
 }
 
 static void resize(struct vo *vo)
@@ -285,7 +285,9 @@ static void resize(struct vo *vo)
                          1000LL * vc->flip_offset_window;
     vo_set_queue_params(vo, vc->flip_offset_us, 1);
 
-    if (vc->output_surface_w < vo->dwidth || vc->output_surface_h < vo->dheight) {
+    if (vc->output_surface_w < vo->dwidth || vc->output_surface_h < vo->dheight ||
+        vc->rotation != vo->params->rotate)
+    {
         vc->output_surface_w = s_size(max_w, vc->output_surface_w, vo->dwidth);
         vc->output_surface_h = s_size(max_h, vc->output_surface_h, vo->dheight);
         // Creation of output_surfaces
@@ -309,6 +311,7 @@ static void resize(struct vo *vo)
             vdp_st = vdp->output_surface_destroy(vc->rotation_surface);
             CHECK_VDP_WARNING(vo, "Error when calling "
                               "vdp_output_surface_destroy");
+            vc->rotation_surface = VDP_INVALID_HANDLE;
         }
         if (vo->params->rotate == 90 || vo->params->rotate == 270) {
             vdp_st = vdp->output_surface_create(vc->vdp_device,
@@ -327,6 +330,7 @@ static void resize(struct vo *vo)
         MP_DBG(vo, "vdpau rotation surface create: %u\n",
                vc->rotation_surface);
     }
+    vc->rotation = vo->params->rotate;
     vo->want_redraw = true;
 }
 
@@ -1027,7 +1031,7 @@ static int preinit(struct vo *vo)
 
     if (mp_vdpau_guess_if_emulated(vc->mpvdp)) {
         MP_WARN(vo, "VDPAU is most likely emulated via VA-API.\n"
-                    "This is inefficient. Use --vo=opengl instead.\n");
+                    "This is inefficient. Use --vo=gpu instead.\n");
     }
 
     // Mark everything as invalid first so uninit() can tell what has been
diff --git a/video/out/vo_x11.c b/video/out/vo_x11.c
index f29d06a..13b22d1 100644
--- a/video/out/vo_x11.c
+++ b/video/out/vo_x11.c
@@ -28,7 +28,6 @@
 #include "vo.h"
 #include "video/csputils.h"
 #include "video/mp_image.h"
-#include "video/filter/vf.h"
 
 #include <X11/Xlib.h>
 #include <X11/Xutil.h>
@@ -245,7 +244,7 @@ static bool resize(struct vo *vo)
         return -1;
     }
 
-    mp_sws_set_from_cmdline(p->sws, vo->opts->sws_opts);
+    mp_sws_set_from_cmdline(p->sws, vo->global);
     p->sws->dst = (struct mp_image_params) {
         .imgfmt = fmte->mpfmt,
         .w = p->dst_w,
diff --git a/video/out/vulkan/common.h b/video/out/vulkan/common.h
index 6e82bfa..1a4c3b8 100644
--- a/video/out/vulkan/common.h
+++ b/video/out/vulkan/common.h
@@ -48,11 +48,29 @@ struct mpvk_ctx {
     VkSurfaceKHR surf;
     VkSurfaceFormatKHR surf_format; // picked at surface initialization time
 
-    struct vk_malloc *alloc; // memory allocator for this device
-    struct vk_cmdpool *pool; // primary command pool for this device
-    struct vk_cmd *last_cmd; // most recently submitted command
+    struct vk_malloc *alloc;      // memory allocator for this device
     struct spirv_compiler *spirv; // GLSL -> SPIR-V compiler
+    struct vk_cmdpool **pools;    // command pools (one per queue family)
+    int num_pools;
+    struct vk_cmd *last_cmd;      // most recently submitted command
+
+    // Queued/pending commands. These are shared for the entire mpvk_ctx to
+    // ensure submission and callbacks are FIFO
+    struct vk_cmd **cmds_queued;  // recorded but not yet submitted
+    struct vk_cmd **cmds_pending; // submitted but not completed
+    int num_cmds_queued;
+    int num_cmds_pending;
+
+    // Pointers into *pools
+    struct vk_cmdpool *pool_graphics; // required
+    struct vk_cmdpool *pool_compute;  // optional
+    struct vk_cmdpool *pool_transfer; // optional
+
+    // Common pool of signals, to avoid having to re-create these objects often
+    struct vk_signal **signals;
+    int num_signals;
 
     // Cached capabilities
     VkPhysicalDeviceLimits limits;
+    VkPhysicalDeviceFeatures features;
 };
diff --git a/video/out/vulkan/context.c b/video/out/vulkan/context.c
index 0bca198..cbe0911 100644
--- a/video/out/vulkan/context.c
+++ b/video/out/vulkan/context.c
@@ -102,11 +102,18 @@ const struct m_sub_options vulkan_conf = {
                    {"fifo-relaxed", SWAP_FIFO_RELAXED},
                    {"mailbox",      SWAP_MAILBOX},
                    {"immediate",    SWAP_IMMEDIATE})),
-        OPT_INTRANGE("vulkan-queue-count", dev_opts.queue_count, 0, 1,
-                     MPVK_MAX_QUEUES, OPTDEF_INT(1)),
+        OPT_INTRANGE("vulkan-queue-count", dev_opts.queue_count, 0, 1, 8,
+                     OPTDEF_INT(1)),
+        OPT_FLAG("vulkan-async-transfer", dev_opts.async_transfer, 0),
+        OPT_FLAG("vulkan-async-compute", dev_opts.async_compute, 0),
         {0}
     },
-    .size = sizeof(struct vulkan_opts)
+    .size = sizeof(struct vulkan_opts),
+    .defaults = &(struct vulkan_opts) {
+        .dev_opts = {
+            .async_transfer = 1,
+        },
+    },
 };
 
 struct priv {
@@ -121,9 +128,10 @@ struct priv {
     // state of the images:
     struct ra_tex **images;   // ra_tex wrappers for the vkimages
     int num_images;           // size of images
-    VkSemaphore *acquired;    // pool of semaphores used to synchronize images
-    int num_acquired;         // size of this pool
-    int idx_acquired;         // index of next free semaphore within this pool
+    VkSemaphore *sems_in;     // pool of semaphores used to synchronize images
+    VkSemaphore *sems_out;    // outgoing semaphores (rendering complete)
+    int num_sems;
+    int idx_sems;             // index of next free semaphore pair
     int last_imgidx;          // the image index last acquired (for submit)
 };
 
@@ -244,17 +252,17 @@ void ra_vk_ctx_uninit(struct ra_ctx *ctx)
         struct priv *p = ctx->swapchain->priv;
         struct mpvk_ctx *vk = p->vk;
 
-        mpvk_pool_wait_idle(vk, vk->pool);
+        mpvk_flush_commands(vk);
+        mpvk_poll_commands(vk, UINT64_MAX);
 
         for (int i = 0; i < p->num_images; i++)
             ra_tex_free(ctx->ra, &p->images[i]);
-        for (int i = 0; i < p->num_acquired; i++)
-            vkDestroySemaphore(vk->dev, p->acquired[i], MPVK_ALLOCATOR);
+        for (int i = 0; i < p->num_sems; i++) {
+            vkDestroySemaphore(vk->dev, p->sems_in[i], MPVK_ALLOCATOR);
+            vkDestroySemaphore(vk->dev, p->sems_out[i], MPVK_ALLOCATOR);
+        }
 
         vkDestroySwapchainKHR(vk->dev, p->swapchain, MPVK_ALLOCATOR);
-
-        talloc_free(p->images);
-        talloc_free(p->acquired);
         ctx->ra->fns->destroy(ctx->ra);
         ctx->ra = NULL;
     }
@@ -355,7 +363,7 @@ bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h)
     // more than one swapchain already active, so we need to flush any pending
     // asynchronous swapchain release operations that may be ongoing.
     while (p->old_swapchain)
-        mpvk_dev_poll_cmds(vk, 100000); // 100μs
+        mpvk_poll_commands(vk, 100000); // 100μs
 
     VkSwapchainCreateInfoKHR sinfo = p->protoInfo;
     sinfo.imageExtent  = (VkExtent2D){ w, h };
@@ -382,13 +390,19 @@ bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h)
     VK(vkGetSwapchainImagesKHR(vk->dev, p->swapchain, &num, vkimages));
 
     // If needed, allocate some more semaphores
-    while (num > p->num_acquired) {
-        VkSemaphore sem;
+    while (num > p->num_sems) {
+        VkSemaphore sem_in, sem_out;
         static const VkSemaphoreCreateInfo seminfo = {
             .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
         };
-        VK(vkCreateSemaphore(vk->dev, &seminfo, MPVK_ALLOCATOR, &sem));
-        MP_TARRAY_APPEND(NULL, p->acquired, p->num_acquired, sem);
+        VK(vkCreateSemaphore(vk->dev, &seminfo, MPVK_ALLOCATOR, &sem_in));
+        VK(vkCreateSemaphore(vk->dev, &seminfo, MPVK_ALLOCATOR, &sem_out));
+
+        int idx = p->num_sems++;
+        MP_TARRAY_GROW(p, p->sems_in, idx);
+        MP_TARRAY_GROW(p, p->sems_out, idx);
+        p->sems_in[idx] = sem_in;
+        p->sems_out[idx] = sem_out;
     }
 
     // Recreate the ra_tex wrappers
@@ -396,7 +410,7 @@ bool ra_vk_ctx_resize(struct ra_swapchain *sw, int w, int h)
         ra_tex_free(ra, &p->images[i]);
 
     p->num_images = num;
-    MP_TARRAY_GROW(NULL, p->images, p->num_images);
+    MP_TARRAY_GROW(p, p->images, p->num_images);
     for (int i = 0; i < num; i++) {
         p->images[i] = ra_vk_wrap_swapchain_img(ra, vkimages[i], sinfo);
         if (!p->images[i])
@@ -439,66 +453,107 @@ static bool start_frame(struct ra_swapchain *sw, struct ra_fbo *out_fbo)
     struct priv *p = sw->priv;
     struct mpvk_ctx *vk = p->vk;
     if (!p->swapchain)
-        goto error;
+        return false;
+
+    VkSemaphore sem_in = p->sems_in[p->idx_sems];
+    MP_TRACE(vk, "vkAcquireNextImageKHR signals %p\n", (void *)sem_in);
+
+    for (int attempts = 0; attempts < 2; attempts++) {
+        uint32_t imgidx = 0;
+        VkResult res = vkAcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX,
+                                             sem_in, NULL, &imgidx);
+
+        switch (res) {
+        case VK_SUCCESS:
+            p->last_imgidx = imgidx;
+            *out_fbo = (struct ra_fbo) {
+                .tex = p->images[imgidx],
+                .flip = false,
+            };
+            ra_tex_vk_external_dep(sw->ctx->ra, out_fbo->tex, sem_in);
+            return true;
+
+        case VK_ERROR_OUT_OF_DATE_KHR: {
+            // In these cases try recreating the swapchain
+            int w = p->w, h = p->h;
+            p->w = p->h = 0; // invalidate the current state
+            if (!ra_vk_ctx_resize(sw, w, h))
+                return false;
+            continue;
+        }
 
-    uint32_t imgidx = 0;
-    MP_TRACE(vk, "vkAcquireNextImageKHR\n");
-    VkResult res = vkAcquireNextImageKHR(vk->dev, p->swapchain, UINT64_MAX,
-                                         p->acquired[p->idx_acquired], NULL,
-                                         &imgidx);
-    if (res == VK_ERROR_OUT_OF_DATE_KHR)
-        goto error; // just return in this case
-    VK_ASSERT(res, "Failed acquiring swapchain image");
-
-    p->last_imgidx = imgidx;
-    *out_fbo = (struct ra_fbo) {
-        .tex = p->images[imgidx],
-        .flip = false,
-    };
-    return true;
+        default:
+            MP_ERR(vk, "Failed acquiring swapchain image: %s\n", vk_err(res));
+            return false;
+        }
+    }
 
-error:
+    // If we've exhausted the number of attempts to recreate the swapchain,
+    // just give up silently.
     return false;
 }
 
+static void present_cb(struct priv *p, void *arg) // set in submit_frame()
+{
+    p->frames_in_flight--; // pairs with the ++ in submit_frame(); arg unused
+}
+
 static bool submit_frame(struct ra_swapchain *sw, const struct vo_frame *frame)
 {
     struct priv *p = sw->priv;
     struct ra *ra = sw->ctx->ra;
     struct mpvk_ctx *vk = p->vk;
     if (!p->swapchain)
-        goto error;
+        return false;
 
-    VkSemaphore acquired = p->acquired[p->idx_acquired++];
-    p->idx_acquired %= p->num_acquired;
+    struct vk_cmd *cmd = ra_vk_submit(ra, p->images[p->last_imgidx]);
+    if (!cmd)
+        return false;
 
-    VkSemaphore done;
-    if (!ra_vk_submit(ra, p->images[p->last_imgidx], acquired, &done,
-                      &p->frames_in_flight))
-        goto error;
+    VkSemaphore sem_out = p->sems_out[p->idx_sems++];
+    p->idx_sems %= p->num_sems;
+    vk_cmd_sig(cmd, sem_out);
+
+    p->frames_in_flight++;
+    vk_cmd_callback(cmd, (vk_cb) present_cb, p, NULL);
+
+    vk_cmd_queue(vk, cmd);
+    if (!mpvk_flush_commands(vk))
+        return false;
 
     // Older nvidia drivers can spontaneously combust when submitting to the
     // same queue as we're rendering from, in a multi-queue scenario. Safest
-    // option is to cycle the queues first and then submit to the next queue.
+    // option is to flush the commands first and then submit to the next queue.
     // We can drop this hack in the future, I suppose.
-    vk_cmd_cycle_queues(vk);
-    struct vk_cmdpool *pool = vk->pool;
-    VkQueue queue = pool->queues[pool->qindex];
+    struct vk_cmdpool *pool = vk->pool_graphics;
+    VkQueue queue = pool->queues[pool->idx_queues];
 
     VkPresentInfoKHR pinfo = {
         .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR,
         .waitSemaphoreCount = 1,
-        .pWaitSemaphores = &done,
+        .pWaitSemaphores = &sem_out,
         .swapchainCount = 1,
         .pSwapchains = &p->swapchain,
         .pImageIndices = &p->last_imgidx,
     };
 
-    VK(vkQueuePresentKHR(queue, &pinfo));
-    return true;
+    MP_TRACE(vk, "vkQueuePresentKHR waits on %p\n", (void *)sem_out);
+    VkResult res = vkQueuePresentKHR(queue, &pinfo);
+    switch (res) {
+    case VK_SUCCESS:
+    case VK_SUBOPTIMAL_KHR:
+        return true;
 
-error:
-    return false;
+    case VK_ERROR_OUT_OF_DATE_KHR:
+        // We can silently ignore this error, since the next start_frame will
+        // recreate the swapchain automatically.
+        return true;
+
+    default:
+        MP_ERR(vk, "Failed presenting to queue %p: %s\n", (void *)queue,
+               vk_err(res));
+        return false;
+    }
 }
 
 static void swap_buffers(struct ra_swapchain *sw)
@@ -506,11 +561,10 @@ static void swap_buffers(struct ra_swapchain *sw)
     struct priv *p = sw->priv;
 
     while (p->frames_in_flight >= sw->ctx->opts.swapchain_depth)
-        mpvk_dev_poll_cmds(p->vk, 100000); // 100μs
+        mpvk_poll_commands(p->vk, 100000); // 100μs
 }
 
 static const struct ra_swapchain_fns vulkan_swapchain = {
-    // .screenshot is not currently supported
     .color_depth   = color_depth,
     .start_frame   = start_frame,
     .submit_frame  = submit_frame,
diff --git a/video/out/vulkan/formats.c b/video/out/vulkan/formats.c
index b44bead..327a7ac 100644
--- a/video/out/vulkan/formats.c
+++ b/video/out/vulkan/formats.c
@@ -25,7 +25,7 @@ const struct vk_format vk_formats[] = {
     {"rg4",      VK_FORMAT_R4G4_UNORM_PACK8,          2,  1,   {4,  4         }, RA_CTYPE_UNORM },
     {"rgba4",    VK_FORMAT_R4G4B4A4_UNORM_PACK16,     4,  2,   {4,  4,  4,  4 }, RA_CTYPE_UNORM },
     {"rgb565",   VK_FORMAT_R5G6B5_UNORM_PACK16,       3,  2,   {5,  6,  5     }, RA_CTYPE_UNORM },
-    {"rgb565a1", VK_FORMAT_R5G5B5A1_UNORM_PACK16,     4,  2,   {5,  5,  5,  1 }, RA_CTYPE_UNORM },
+    {"rgb5a1",   VK_FORMAT_R5G5B5A1_UNORM_PACK16,     4,  2,   {5,  5,  5,  1 }, RA_CTYPE_UNORM },
 
     // Float formats (native formats, hf = half float, df = double float)
     {"r16hf",    VK_FORMAT_R16_SFLOAT,                1,  2,   {16            }, RA_CTYPE_FLOAT },
@@ -46,7 +46,7 @@ const struct vk_format vk_formats[] = {
     {"bgra8",    VK_FORMAT_B8G8R8A8_UNORM,            4,  4,   {8,  8,  8,  8 }, RA_CTYPE_UNORM, true },
     {"bgra4",    VK_FORMAT_B4G4R4A4_UNORM_PACK16,     4,  2,   {4,  4,  4,  4 }, RA_CTYPE_UNORM, true },
     {"bgr565",   VK_FORMAT_B5G6R5_UNORM_PACK16,       3,  2,   {5,  6,  5     }, RA_CTYPE_UNORM, true },
-    {"bgr565a1", VK_FORMAT_B5G5R5A1_UNORM_PACK16,     4,  2,   {5,  5,  5,  1 }, RA_CTYPE_UNORM, true },
+    {"bgr5a1",   VK_FORMAT_B5G5R5A1_UNORM_PACK16,     4,  2,   {5,  5,  5,  1 }, RA_CTYPE_UNORM, true },
     {"a1rgb5",   VK_FORMAT_A1R5G5B5_UNORM_PACK16,     4,  2,   {1,  5,  5,  5 }, RA_CTYPE_UNORM, true },
     {"a2rgb10",  VK_FORMAT_A2R10G10B10_UNORM_PACK32,  4,  4,   {2,  10, 10, 10}, RA_CTYPE_UNORM, true },
     {"a2bgr10",  VK_FORMAT_A2B10G10R10_UNORM_PACK32,  4,  4,   {2,  10, 10, 10}, RA_CTYPE_UNORM, true },
diff --git a/video/out/vulkan/malloc.c b/video/out/vulkan/malloc.c
index f6cb114..32c2c6b 100644
--- a/video/out/vulkan/malloc.c
+++ b/video/out/vulkan/malloc.c
@@ -133,11 +133,22 @@ static struct vk_slab *slab_alloc(struct mpvk_ctx *vk, struct vk_heap *heap,
 
     uint32_t typeBits = heap->typeBits ? heap->typeBits : UINT32_MAX;
     if (heap->usage) {
+        // FIXME: Since we can't keep track of queue family ownership properly,
+        // and we don't know in advance what types of queue families this buffer
+        // will belong to, we're forced to share all of our buffers between all
+        // command pools.
+        uint32_t qfs[3] = {0};
+        for (int i = 0; i < vk->num_pools; i++)
+            qfs[i] = vk->pools[i]->qf;
+
         VkBufferCreateInfo binfo = {
             .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
             .size  = slab->size,
             .usage = heap->usage,
-            .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+            .sharingMode = vk->num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
+                                             : VK_SHARING_MODE_EXCLUSIVE,
+            .queueFamilyIndexCount = vk->num_pools,
+            .pQueueFamilyIndices = qfs,
         };
 
         VK(vkCreateBuffer(vk->dev, &binfo, MPVK_ALLOCATOR, &slab->buffer));
diff --git a/video/out/vulkan/ra_vk.c b/video/out/vulkan/ra_vk.c
index f85e30e..cffb895 100644
--- a/video/out/vulkan/ra_vk.c
+++ b/video/out/vulkan/ra_vk.c
@@ -6,6 +6,12 @@
 
 static struct ra_fns ra_fns_vk;
 
+enum queue_type { // command pool selector for vk_require_cmd()
+    GRAPHICS, // vk->pool_graphics
+    COMPUTE,  // vk->pool_compute
+    TRANSFER, // vk->pool_transfer, falling back to compute, then graphics
+};
+
 // For ra.priv
 struct ra_vk {
     struct mpvk_ctx *vk;
@@ -22,51 +28,57 @@ struct mpvk_ctx *ra_vk_get(struct ra *ra)
     return p->vk;
 }
 
-// Returns a command buffer, or NULL on error
-static struct vk_cmd *vk_require_cmd(struct ra *ra)
-{
-    struct ra_vk *p = ra->priv;
-    struct mpvk_ctx *vk = ra_vk_get(ra);
-
-    if (!p->cmd)
-        p->cmd = vk_cmd_begin(vk, vk->pool);
-
-    return p->cmd;
-}
-
-// Note: This technically follows the flush() API, but we don't need
-// to expose that (and in fact, it's a bad idea) since we control flushing
-// behavior with ra_vk_present_frame already.
-static bool vk_flush(struct ra *ra, VkSemaphore *done)
+static void vk_submit(struct ra *ra) // no-op when no command buffer is pending
 {
     struct ra_vk *p = ra->priv;
     struct mpvk_ctx *vk = ra_vk_get(ra);
 
     if (p->cmd) {
-        if (!vk_cmd_submit(vk, p->cmd, done))
-            return false;
+        vk_cmd_queue(vk, p->cmd); // queued only; callers flush separately
         p->cmd = NULL;
     }
-
-    return true;
 }
 
-// The callback's *priv will always be set to `ra`
-static void vk_callback(struct ra *ra, vk_cb callback, void *arg)
+// Returns a command buffer for the given queue type, or NULL on error
+static struct vk_cmd *vk_require_cmd(struct ra *ra, enum queue_type type)
 {
     struct ra_vk *p = ra->priv;
     struct mpvk_ctx *vk = ra_vk_get(ra);
 
-    if (p->cmd) {
-        vk_cmd_callback(p->cmd, callback, ra, arg);
-    } else {
-        vk_dev_callback(vk, callback, ra, arg);
+    struct vk_cmdpool *pool;
+    switch (type) {
+    case GRAPHICS: pool = vk->pool_graphics; break;
+    case COMPUTE:  pool = vk->pool_compute;  break;
+
+    // GRAPHICS and COMPUTE also imply TRANSFER capability (vulkan spec)
+    case TRANSFER:
+        pool = vk->pool_transfer;
+        if (!pool)
+            pool = vk->pool_compute;
+        if (!pool)
+            pool = vk->pool_graphics;
+        break;
+    default: abort();
     }
+
+    assert(pool); // callers may only request pool types that exist
+    if (p->cmd && p->cmd->pool == pool) // pending cmd already uses this pool
+        return p->cmd;
+
+    vk_submit(ra); // queue any pending cmd that belongs to a different pool
+    p->cmd = vk_cmd_begin(vk, pool);
+    return p->cmd;
 }
 
 #define MAKE_LAZY_DESTRUCTOR(fun, argtype)                  \
     static void fun##_lazy(struct ra *ra, argtype *arg) {   \
-        vk_callback(ra, (vk_cb) fun, arg);                  \
+        struct ra_vk *p = ra->priv;                         \
+        struct mpvk_ctx *vk = ra_vk_get(ra);                \
+        if (p->cmd) {                                       \
+            vk_cmd_callback(p->cmd, (vk_cb) fun, ra, arg);  \
+        } else {                                            \
+            vk_dev_callback(vk, (vk_cb) fun, ra, arg);      \
+        }                                                   \
     }
 
 static void vk_destroy_ra(struct ra *ra)
@@ -74,8 +86,9 @@ static void vk_destroy_ra(struct ra *ra)
     struct ra_vk *p = ra->priv;
     struct mpvk_ctx *vk = ra_vk_get(ra);
 
-    vk_flush(ra, NULL);
-    mpvk_dev_wait_idle(vk);
+    vk_submit(ra);
+    mpvk_flush_commands(vk);
+    mpvk_poll_commands(vk, UINT64_MAX);
     ra_tex_free(ra, &p->clear_tex);
 
     talloc_free(ra);
@@ -195,8 +208,13 @@ struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
     ra->max_shmem = vk->limits.maxComputeSharedMemorySize;
     ra->max_pushc_size = vk->limits.maxPushConstantsSize;
 
-    if (vk->pool->props.queueFlags & VK_QUEUE_COMPUTE_BIT)
-        ra->caps |= RA_CAP_COMPUTE;
+    if (vk->pool_compute) {
+        ra->caps |= RA_CAP_COMPUTE | RA_CAP_NUM_GROUPS;
+        // If we have more compute queues than graphics queues, we probably
+        // want to be using them. (This seems mostly relevant for AMD)
+        if (vk->pool_compute->num_queues > vk->pool_graphics->num_queues)
+            ra->caps |= RA_CAP_PARALLEL_COMPUTE;
+    }
 
     if (!vk_setup_formats(ra))
         goto error;
@@ -204,8 +222,8 @@ struct ra *ra_create_vk(struct mpvk_ctx *vk, struct mp_log *log)
     // UBO support is required
     ra->caps |= RA_CAP_BUF_RO | RA_CAP_FRAGCOORD;
 
-    // textureGather is only supported in GLSL 400+
-    if (ra->glsl_version >= 400)
+    // textureGather requires the ImageGatherExtended capability
+    if (vk->features.shaderImageGatherExtended)
         ra->caps |= RA_CAP_GATHER;
 
     // Try creating a shader storage buffer
@@ -246,9 +264,13 @@ error:
 }
 
 // Boilerplate wrapper around vkCreateRenderPass to ensure passes remain
-// compatible
+// compatible. The renderpass will automatically transition the image out of
+// initialLayout and into finalLayout.
 static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
-                                      bool load_fbo, VkRenderPass *out)
+                                      VkAttachmentLoadOp loadOp,
+                                      VkImageLayout initialLayout,
+                                      VkImageLayout finalLayout,
+                                      VkRenderPass *out)
 {
     struct vk_format *vk_fmt = fmt->priv;
     assert(fmt->renderable);
@@ -259,12 +281,10 @@ static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
         .pAttachments = &(VkAttachmentDescription) {
             .format = vk_fmt->iformat,
             .samples = VK_SAMPLE_COUNT_1_BIT,
-            .loadOp = load_fbo ? VK_ATTACHMENT_LOAD_OP_LOAD
-                               : VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+            .loadOp = loadOp,
             .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
-            .initialLayout = load_fbo ? VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL
-                                      : VK_IMAGE_LAYOUT_UNDEFINED,
-            .finalLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+            .initialLayout = initialLayout,
+            .finalLayout = finalLayout,
         },
         .subpassCount = 1,
         .pSubpasses = &(VkSubpassDescription) {
@@ -283,6 +303,7 @@ static VkResult vk_create_render_pass(VkDevice dev, const struct ra_format *fmt,
 // For ra_tex.priv
 struct ra_tex_vk {
     bool external_img;
+    enum queue_type upload_queue;
     VkImageType type;
     VkImage img;
     struct vk_memslice mem;
@@ -296,16 +317,34 @@ struct ra_tex_vk {
     struct ra_buf_pool pbo;
     // "current" metadata, can change during the course of execution
     VkImageLayout current_layout;
-    VkPipelineStageFlags current_stage;
     VkAccessFlags current_access;
+    // the signal guards reuse, and can be NULL
+    struct vk_signal *sig;
+    VkPipelineStageFlags sig_stage;
+    VkSemaphore ext_dep; // external semaphore, not owned by the ra_tex
 };
 
+void ra_tex_vk_external_dep(struct ra *ra, struct ra_tex *tex, VkSemaphore dep)
+{
+    struct ra_tex_vk *tex_vk = tex->priv;
+    assert(!tex_vk->ext_dep); // at most one pending external dependency
+    tex_vk->ext_dep = dep; // consumed and cleared by the next tex_barrier()
+}
+
 // Small helper to ease image barrier creation. if `discard` is set, the contents
 // of the image will be undefined after the barrier
-static void tex_barrier(struct vk_cmd *cmd, struct ra_tex_vk *tex_vk,
-                        VkPipelineStageFlags newStage, VkAccessFlags newAccess,
+static void tex_barrier(struct ra *ra, struct vk_cmd *cmd, struct ra_tex *tex,
+                        VkPipelineStageFlags stage, VkAccessFlags newAccess,
                         VkImageLayout newLayout, bool discard)
 {
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+    struct ra_tex_vk *tex_vk = tex->priv;
+
+    if (tex_vk->ext_dep) {
+        vk_cmd_dep(cmd, tex_vk->ext_dep, stage);
+        tex_vk->ext_dep = VK_NULL_HANDLE; // one-shot: now recorded on cmd
+    }
+
     VkImageMemoryBarrier imgBarrier = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
         .oldLayout = tex_vk->current_layout,
@@ -323,18 +362,43 @@ static void tex_barrier(struct vk_cmd *cmd, struct ra_tex_vk *tex_vk,
         imgBarrier.srcAccessMask = 0;
     }
 
-    if (imgBarrier.oldLayout != imgBarrier.newLayout ||
-        imgBarrier.srcAccessMask != imgBarrier.dstAccessMask)
-    {
-        vkCmdPipelineBarrier(cmd->buf, tex_vk->current_stage, newStage, 0,
-                             0, NULL, 0, NULL, 1, &imgBarrier);
+    VkEvent event = VK_NULL_HANDLE; // out-param, may be set by vk_cmd_wait()
+    vk_cmd_wait(vk, cmd, &tex_vk->sig, stage, &event);
+
+    bool need_trans = tex_vk->current_layout != newLayout ||
+                      tex_vk->current_access != newAccess;
+
+    // Transitioning to VK_IMAGE_LAYOUT_UNDEFINED is a pseudo-operation
+    // that for us means we don't need to perform the actual transition
+    if (need_trans && newLayout != VK_IMAGE_LAYOUT_UNDEFINED) {
+        if (event) {
+            vkCmdWaitEvents(cmd->buf, 1, &event, tex_vk->sig_stage,
+                            stage, 0, NULL, 0, NULL, 1, &imgBarrier);
+        } else {
+            // If we're not using an event, then the source stage is irrelevant
+            // because we're coming from a different queue anyway, so we can
+            // safely set it to TOP_OF_PIPE.
+            imgBarrier.srcAccessMask = 0;
+            vkCmdPipelineBarrier(cmd->buf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
+                                 stage, 0, 0, NULL, 0, NULL, 1, &imgBarrier);
+        }
     }
 
-    tex_vk->current_stage = newStage;
     tex_vk->current_layout = newLayout;
     tex_vk->current_access = newAccess;
 }
 
+static void tex_signal(struct ra *ra, struct vk_cmd *cmd, struct ra_tex *tex,
+                       VkPipelineStageFlags stage)
+{
+    struct ra_tex_vk *tex_vk = tex->priv;
+    struct mpvk_ctx *vk = ra_vk_get(ra);
+    assert(!tex_vk->sig); // a texture holds at most one outstanding signal
+
+    tex_vk->sig = vk_cmd_signal(vk, cmd, stage);
+    tex_vk->sig_stage = stage; // source stage for the wait in tex_barrier()
+}
+
 static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex)
 {
     if (!tex)
@@ -344,6 +408,7 @@ static void vk_tex_destroy(struct ra *ra, struct ra_tex *tex)
     struct ra_tex_vk *tex_vk = tex->priv;
 
     ra_buf_pool_uninit(ra, &tex_vk->pbo);
+    vk_signal_destroy(vk, &tex_vk->sig);
     vkDestroyFramebuffer(vk->dev, tex_vk->framebuffer, MPVK_ALLOCATOR);
     vkDestroyRenderPass(vk->dev, tex_vk->dummyPass, MPVK_ALLOCATOR);
     vkDestroySampler(vk->dev, tex_vk->sampler, MPVK_ALLOCATOR);
@@ -368,7 +433,6 @@ static bool vk_init_image(struct ra *ra, struct ra_tex *tex)
     assert(tex_vk->img);
 
     tex_vk->current_layout = VK_IMAGE_LAYOUT_UNDEFINED;
-    tex_vk->current_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT;
     tex_vk->current_access = 0;
 
     if (params->render_src || params->render_dst) {
@@ -415,7 +479,11 @@ static bool vk_init_image(struct ra *ra, struct ra_tex *tex)
         // Framebuffers need to be created against a specific render pass
         // layout, so we need to temporarily create a skeleton/dummy render
         // pass for vulkan to figure out the compatibility
-        VK(vk_create_render_pass(vk->dev, params->format, false, &tex_vk->dummyPass));
+        VK(vk_create_render_pass(vk->dev, params->format,
+                                 VK_ATTACHMENT_LOAD_OP_DONT_CARE,
+                                 VK_IMAGE_LAYOUT_UNDEFINED,
+                                 VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+                                 &tex_vk->dummyPass));
 
         VkFramebufferCreateInfo finfo = {
             .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
@@ -444,12 +512,14 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
                                     const struct ra_tex_params *params)
 {
     struct mpvk_ctx *vk = ra_vk_get(ra);
+    assert(!params->format->dummy_format);
 
     struct ra_tex *tex = talloc_zero(NULL, struct ra_tex);
     tex->params = *params;
     tex->params.initial_data = NULL;
 
     struct ra_tex_vk *tex_vk = tex->priv = talloc_zero(tex, struct ra_tex_vk);
+    tex_vk->upload_queue = GRAPHICS;
 
     const struct vk_format *fmt = params->format->priv;
     switch (params->dimensions) {
@@ -471,6 +541,10 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
     if (params->host_mutable || params->blit_dst || params->initial_data)
         usage |= VK_IMAGE_USAGE_TRANSFER_DST_BIT;
 
+    // Always use the transfer pool if available, for efficiency
+    if (params->host_mutable && vk->pool_transfer)
+        tex_vk->upload_queue = TRANSFER;
+
     // Double-check image usage support and fail immediately if invalid
     VkImageFormatProperties iprop;
     VkResult res = vkGetPhysicalDeviceImageFormatProperties(vk->physd,
@@ -498,6 +572,14 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
         return NULL;
     }
 
+    // FIXME: Since we can't keep track of queue family ownership properly,
+    // and we don't know in advance what types of queue families this image
+    // will belong to, we're forced to share all of our images between all
+    // command pools.
+    uint32_t qfs[3] = {0};
+    for (int i = 0; i < vk->num_pools; i++)
+        qfs[i] = vk->pools[i]->qf;
+
     VkImageCreateInfo iinfo = {
         .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
         .imageType = tex_vk->type,
@@ -509,9 +591,10 @@ static struct ra_tex *vk_tex_create(struct ra *ra,
         .tiling = VK_IMAGE_TILING_OPTIMAL,
         .usage = usage,
         .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
-        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
-        .queueFamilyIndexCount = 1,
-        .pQueueFamilyIndices = &vk->pool->qf,
+        .sharingMode = vk->num_pools > 1 ? VK_SHARING_MODE_CONCURRENT
+                                         : VK_SHARING_MODE_EXCLUSIVE,
+        .queueFamilyIndexCount = vk->num_pools,
+        .pQueueFamilyIndices = qfs,
     };
 
     VK(vkCreateImage(vk->dev, &iinfo, MPVK_ALLOCATOR, &tex_vk->img));
@@ -602,6 +685,7 @@ struct ra_buf_vk {
     struct vk_bufslice slice;
     int refcount; // 1 = object allocated but not in use, > 1 = in use
     bool needsflush;
+    enum queue_type update_queue;
     // "current" metadata, can change during course of execution
     VkPipelineStageFlags current_stage;
     VkAccessFlags current_access;
@@ -631,6 +715,8 @@ static void buf_barrier(struct ra *ra, struct vk_cmd *cmd, struct ra_buf *buf,
         .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
         .srcAccessMask = buf_vk->current_access,
         .dstAccessMask = newAccess,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
         .buffer = buf_vk->slice.buf,
         .offset = offset,
         .size = size,
@@ -670,7 +756,7 @@ static void vk_buf_update(struct ra *ra, struct ra_buf *buf, ptrdiff_t offset,
         memcpy((void *)addr, data, size);
         buf_vk->needsflush = true;
     } else {
-        struct vk_cmd *cmd = vk_require_cmd(ra);
+        struct vk_cmd *cmd = vk_require_cmd(ra, buf_vk->update_queue);
         if (!cmd) {
             MP_ERR(ra, "Failed updating buffer!\n");
             return;
@@ -706,6 +792,9 @@ static struct ra_buf *vk_buf_create(struct ra *ra,
     case RA_BUF_TYPE_TEX_UPLOAD:
         bufFlags |= VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
         memFlags |= VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+        // Use TRANSFER-style updates for large enough buffers for efficiency
+        if (params->size > 1024*1024) // 1 MB
+            buf_vk->update_queue = TRANSFER;
         break;
     case RA_BUF_TYPE_UNIFORM:
         bufFlags |= VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT;
@@ -716,6 +805,7 @@ static struct ra_buf *vk_buf_create(struct ra *ra,
         bufFlags |= VK_BUFFER_USAGE_STORAGE_BUFFER_BIT;
         memFlags |= VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
         align = MP_ALIGN_UP(align, vk->limits.minStorageBufferOffsetAlignment);
+        buf_vk->update_queue = COMPUTE;
         break;
     case RA_BUF_TYPE_VERTEX:
         bufFlags |= VK_BUFFER_USAGE_VERTEX_BUFFER_BIT;
@@ -802,14 +892,14 @@ static bool vk_tex_upload(struct ra *ra,
     uint64_t size = region.bufferRowLength * region.bufferImageHeight *
                     region.imageExtent.depth;
 
-    struct vk_cmd *cmd = vk_require_cmd(ra);
+    struct vk_cmd *cmd = vk_require_cmd(ra, tex_vk->upload_queue);
     if (!cmd)
         goto error;
 
     buf_barrier(ra, cmd, buf, VK_PIPELINE_STAGE_TRANSFER_BIT,
                 VK_ACCESS_TRANSFER_READ_BIT, region.bufferOffset, size);
 
-    tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT,
+    tex_barrier(ra, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT,
                 VK_ACCESS_TRANSFER_WRITE_BIT,
                 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                 params->invalidate);
@@ -817,6 +907,8 @@ static bool vk_tex_upload(struct ra *ra,
     vkCmdCopyBufferToImage(cmd->buf, buf_vk->slice.buf, tex_vk->img,
                            tex_vk->current_layout, 1, &region);
 
+    tex_signal(ra, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT);
+
     return true;
 
 error:
@@ -831,6 +923,8 @@ struct ra_renderpass_vk {
     VkPipeline pipe;
     VkPipelineLayout pipeLayout;
     VkRenderPass renderPass;
+    VkImageLayout initialLayout;
+    VkImageLayout finalLayout;
     // Descriptor set (bindings)
     VkDescriptorSetLayout dsLayout;
     VkDescriptorPool dsPool;
@@ -1158,8 +1252,27 @@ static struct ra_renderpass *vk_renderpass_create(struct ra *ra,
                 goto error;
             }
         }
-        VK(vk_create_render_pass(vk->dev, params->target_format,
-                                 params->enable_blend, &pass_vk->renderPass));
+
+        // This is the most common case, so optimize towards it. In this case,
+        // the renderpass will take care of almost all layout transitions
+        pass_vk->initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+        pass_vk->finalLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
+        VkAttachmentLoadOp loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+
+        // If we're blending, then we need to explicitly load the previous
+        // contents of the color attachment
+        if (pass->params.enable_blend)
+            loadOp = VK_ATTACHMENT_LOAD_OP_LOAD;
+
+        // If we're invalidating the target, we don't need to load or transition
+        if (pass->params.invalidate_target) {
+            pass_vk->initialLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+            loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE;
+        }
+
+        VK(vk_create_render_pass(vk->dev, params->target_format, loadOp,
+                                 pass_vk->initialLayout, pass_vk->finalLayout,
+                                 &pass_vk->renderPass));
 
         static const VkBlendFactor blendFactors[] = {
             [RA_BLEND_ZERO]                = VK_BLEND_FACTOR_ZERO,
@@ -1312,6 +1425,11 @@ error:
     return pass;
 }
 
+static const VkPipelineStageFlags passStages[] = { // indexed by pass type
+    [RA_RENDERPASS_TYPE_RASTER]  = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+    [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+};
+
 static void vk_update_descriptor(struct ra *ra, struct vk_cmd *cmd,
                                  struct ra_renderpass *pass,
                                  struct ra_renderpass_input_val val,
@@ -1329,18 +1447,13 @@ static void vk_update_descriptor(struct ra *ra, struct vk_cmd *cmd,
         .descriptorType = dsType[inp->type],
     };
 
-    static const VkPipelineStageFlags passStages[] = {
-        [RA_RENDERPASS_TYPE_RASTER]  = VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
-        [RA_RENDERPASS_TYPE_COMPUTE] = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-    };
-
     switch (inp->type) {
     case RA_VARTYPE_TEX: {
         struct ra_tex *tex = *(struct ra_tex **)val.data;
         struct ra_tex_vk *tex_vk = tex->priv;
 
         assert(tex->params.render_src);
-        tex_barrier(cmd, tex_vk, passStages[pass->params.type],
+        tex_barrier(ra, cmd, tex, passStages[pass->params.type],
                     VK_ACCESS_SHADER_READ_BIT,
                     VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, false);
 
@@ -1359,7 +1472,7 @@ static void vk_update_descriptor(struct ra *ra, struct vk_cmd *cmd,
         struct ra_tex_vk *tex_vk = tex->priv;
 
         assert(tex->params.storage_dst);
-        tex_barrier(cmd, tex_vk, passStages[pass->params.type],
+        tex_barrier(ra, cmd, tex, passStages[pass->params.type],
                     VK_ACCESS_SHADER_WRITE_BIT,
                     VK_IMAGE_LAYOUT_GENERAL, false);
 
@@ -1397,6 +1510,22 @@ static void vk_update_descriptor(struct ra *ra, struct vk_cmd *cmd,
     }
 }
 
+static void vk_release_descriptor(struct ra *ra, struct vk_cmd *cmd,
+                                  struct ra_renderpass *pass,
+                                  struct ra_renderpass_input_val val)
+{
+    struct ra_renderpass_input *inp = &pass->params.inputs[val.index];
+
+    switch (inp->type) { // inputs of any other type are left untouched
+    case RA_VARTYPE_IMG_W:
+    case RA_VARTYPE_TEX: { // both are read here as a struct ra_tex * in val.data
+        struct ra_tex *tex = *(struct ra_tex **)val.data;
+        tex_signal(ra, cmd, tex, passStages[pass->params.type]);
+        break;
+    }
+    }
+}
+
 static void vk_renderpass_run(struct ra *ra,
                               const struct ra_renderpass_run_params *params)
 {
@@ -1404,7 +1533,12 @@ static void vk_renderpass_run(struct ra *ra,
     struct ra_renderpass *pass = params->pass;
     struct ra_renderpass_vk *pass_vk = pass->priv;
 
-    struct vk_cmd *cmd = vk_require_cmd(ra);
+    static const enum queue_type types[] = {
+        [RA_RENDERPASS_TYPE_RASTER]  = GRAPHICS,
+        [RA_RENDERPASS_TYPE_COMPUTE] = COMPUTE,
+    };
+
+    struct vk_cmd *cmd = vk_require_cmd(ra, types[pass->params.type]);
     if (!cmd)
         goto error;
 
@@ -1469,13 +1603,9 @@ static void vk_renderpass_run(struct ra *ra,
         vkCmdBindVertexBuffers(cmd->buf, 0, 1, &buf_vk->slice.buf,
                                &buf_vk->slice.mem.offset);
 
-        if (pass->params.enable_blend) {
-            // Normally this transition is handled implicitly by the renderpass,
-            // but if we need to preserve the FBO we have to do it manually.
-            tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
-                        VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
-                        VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL, false);
-        }
+        tex_barrier(ra, cmd, tex, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
+                    VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, pass_vk->initialLayout,
+                    pass->params.invalidate_target);
 
         VkViewport viewport = {
             .x = params->viewport.x0,
@@ -1504,14 +1634,21 @@ static void vk_renderpass_run(struct ra *ra,
         vkCmdEndRenderPass(cmd->buf);
 
         // The renderPass implicitly transitions the texture to this layout
-        tex_vk->current_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
-        tex_vk->current_access = VK_ACCESS_SHADER_READ_BIT;
-        tex_vk->current_stage = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT;
+        tex_vk->current_layout = pass_vk->finalLayout;
+        tex_vk->current_access = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT;
+        tex_signal(ra, cmd, tex, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT);
         break;
     }
     default: abort();
     };
 
+    for (int i = 0; i < params->num_values; i++)
+        vk_release_descriptor(ra, cmd, pass, params->values[i]);
+
+    // flush the work so far into its own command buffer, for better cross-frame
+    // granularity
+    vk_submit(ra);
+
 error:
     return;
 }
@@ -1525,11 +1662,11 @@ static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src,
     struct ra_tex_vk *src_vk = src->priv;
     struct ra_tex_vk *dst_vk = dst->priv;
 
-    struct vk_cmd *cmd = vk_require_cmd(ra);
+    struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
     if (!cmd)
         return;
 
-    tex_barrier(cmd, src_vk, VK_PIPELINE_STAGE_TRANSFER_BIT,
+    tex_barrier(ra, cmd, src, VK_PIPELINE_STAGE_TRANSFER_BIT,
                 VK_ACCESS_TRANSFER_READ_BIT,
                 VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
                 false);
@@ -1539,20 +1676,46 @@ static void vk_blit(struct ra *ra, struct ra_tex *dst, struct ra_tex *src,
                    dst_rc->x1 == dst->params.w &&
                    dst_rc->y1 == dst->params.h;
 
-    tex_barrier(cmd, dst_vk, VK_PIPELINE_STAGE_TRANSFER_BIT,
+    tex_barrier(ra, cmd, dst, VK_PIPELINE_STAGE_TRANSFER_BIT,
                 VK_ACCESS_TRANSFER_WRITE_BIT,
                 VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
                 discard);
 
-    VkImageBlit region = {
-        .srcSubresource = vk_layers,
-        .srcOffsets = {{src_rc->x0, src_rc->y0, 0}, {src_rc->x1, src_rc->y1, 1}},
-        .dstSubresource = vk_layers,
-        .dstOffsets = {{dst_rc->x0, dst_rc->y0, 0}, {dst_rc->x1, dst_rc->y1, 1}},
-    };
+    // Under certain conditions we can use vkCmdCopyImage instead of
+    // vkCmdBlitImage, namely when the blit operation does not require
+    // scaling or flipping, and the formats have the same pixel size.
+    if (src->params.format->pixel_size == dst->params.format->pixel_size &&
+        mp_rect_w(*src_rc) == mp_rect_w(*dst_rc) &&
+        mp_rect_h(*src_rc) == mp_rect_h(*dst_rc) &&
+        mp_rect_w(*src_rc) >= 0 && mp_rect_h(*src_rc) >= 0)
+    {
+        VkImageCopy region = {
+            .srcSubresource = vk_layers,
+            .dstSubresource = vk_layers,
+            .srcOffset = {src_rc->x0, src_rc->y0, 0},
+            .dstOffset = {dst_rc->x0, dst_rc->y0, 0},
+            .extent = {mp_rect_w(*src_rc), mp_rect_h(*src_rc), 1},
+        };
 
-    vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout, dst_vk->img,
-                   dst_vk->current_layout, 1, &region, VK_FILTER_NEAREST);
+        vkCmdCopyImage(cmd->buf, src_vk->img, src_vk->current_layout,
+                       dst_vk->img, dst_vk->current_layout, 1, &region);
+    } else {
+        VkImageBlit region = {
+            .srcSubresource = vk_layers,
+            .dstSubresource = vk_layers,
+            .srcOffsets = {{src_rc->x0, src_rc->y0, 0},
+                           {src_rc->x1, src_rc->y1, 1}},
+            .dstOffsets = {{dst_rc->x0, dst_rc->y0, 0},
+                           {dst_rc->x1, dst_rc->y1, 1}},
+        };
+
+        vkCmdBlitImage(cmd->buf, src_vk->img, src_vk->current_layout,
+                       dst_vk->img, dst_vk->current_layout, 1, &region,
+                       VK_FILTER_NEAREST);
+    }
+
+    tex_signal(ra, cmd, src, VK_PIPELINE_STAGE_TRANSFER_BIT);
+    tex_signal(ra, cmd, dst, VK_PIPELINE_STAGE_TRANSFER_BIT);
 }
 
 static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4],
@@ -1562,14 +1725,14 @@ static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4],
     struct ra_tex_vk *tex_vk = tex->priv;
     assert(tex->params.blit_dst);
 
-    struct vk_cmd *cmd = vk_require_cmd(ra);
+    struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
     if (!cmd)
         return;
 
     struct mp_rect full = {0, 0, tex->params.w, tex->params.h};
     if (!rc || mp_rect_equals(rc, &full)) {
         // To clear the entire image, we can use the efficient clear command
-        tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_TRANSFER_BIT,
+        tex_barrier(ra, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT,
                     VK_ACCESS_TRANSFER_WRITE_BIT,
                     VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, true);
 
@@ -1579,6 +1742,8 @@ static void vk_clear(struct ra *ra, struct ra_tex *tex, float color[4],
 
         vkCmdClearColorImage(cmd->buf, tex_vk->img, tex_vk->current_layout,
                              &clearColor, 1, &vk_range);
+
+        tex_signal(ra, cmd, tex, VK_PIPELINE_STAGE_TRANSFER_BIT);
     } else {
         // To simulate per-region clearing, we blit from a 1x1 texture instead
         struct ra_tex_upload_params ul_params = {
@@ -1600,6 +1765,7 @@ static int vk_desc_namespace(enum ra_vartype type)
 
 struct vk_timer {
     VkQueryPool pool;
+    int index_seen; // keeps track of which indices have been used at least once
     int index;
     uint64_t result;
 };
@@ -1624,6 +1790,7 @@ static ra_timer *vk_timer_create(struct ra *ra)
     struct mpvk_ctx *vk = ra_vk_get(ra);
 
     struct vk_timer *timer = talloc_zero(NULL, struct vk_timer);
+    timer->index_seen = -1;
 
     struct VkQueryPoolCreateInfo qinfo = {
         .sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
@@ -1643,7 +1810,7 @@ error:
 static void vk_timer_record(struct ra *ra, VkQueryPool pool, int index,
                             VkPipelineStageFlags stage)
 {
-    struct vk_cmd *cmd = vk_require_cmd(ra);
+    struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
     if (!cmd)
         return;
 
@@ -1655,12 +1822,15 @@ static void vk_timer_start(struct ra *ra, ra_timer *ratimer)
     struct mpvk_ctx *vk = ra_vk_get(ra);
     struct vk_timer *timer = ratimer;
 
-    timer->index = (timer->index + 2) % VK_QUERY_POOL_SIZE;
-
+    VkResult res = VK_NOT_READY;
     uint64_t out[2];
-    VkResult res = vkGetQueryPoolResults(vk->dev, timer->pool, timer->index, 2,
-                                         sizeof(out), &out[0], sizeof(uint64_t),
-                                         VK_QUERY_RESULT_64_BIT);
+
+    if (timer->index <= timer->index_seen) {
+        res = vkGetQueryPoolResults(vk->dev, timer->pool, timer->index, 2,
+                                    sizeof(out), &out[0], sizeof(uint64_t),
+                                    VK_QUERY_RESULT_64_BIT);
+    }
+
     switch (res) {
     case VK_SUCCESS:
         timer->result = (out[1] - out[0]) * vk->limits.timestampPeriod;
@@ -1683,6 +1853,9 @@ static uint64_t vk_timer_stop(struct ra *ra, ra_timer *ratimer)
     vk_timer_record(ra, timer->pool, timer->index + 1,
                     VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
 
+    timer->index_seen = MPMAX(timer->index_seen, timer->index);
+    timer->index = (timer->index + 2) % VK_QUERY_POOL_SIZE;
+
     return timer->result;
 }
 
@@ -1709,39 +1882,20 @@ static struct ra_fns ra_fns_vk = {
     .timer_stop             = vk_timer_stop,
 };
 
-static void present_cb(void *priv, int *inflight)
-{
-    *inflight -= 1;
-}
-
-bool ra_vk_submit(struct ra *ra, struct ra_tex *tex, VkSemaphore acquired,
-                  VkSemaphore *done, int *inflight)
+struct vk_cmd *ra_vk_submit(struct ra *ra, struct ra_tex *tex)
 {
-    struct vk_cmd *cmd = vk_require_cmd(ra);
+    struct ra_vk *p = ra->priv;
+    struct vk_cmd *cmd = vk_require_cmd(ra, GRAPHICS);
     if (!cmd)
-        goto error;
-
-    if (inflight) {
-        *inflight += 1;
-        vk_cmd_callback(cmd, (vk_cb)present_cb, NULL, inflight);
-    }
+        return NULL;
 
     struct ra_tex_vk *tex_vk = tex->priv;
     assert(tex_vk->external_img);
-    tex_barrier(cmd, tex_vk, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, 0,
-                VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, false);
-
-    // These are the only two stages that we use/support for actually
-    // outputting to swapchain imagechain images, so just add a dependency
-    // on both of them. In theory, we could maybe come up with some more
-    // advanced mechanism of tracking dynamic dependencies, but that seems
-    // like overkill.
-    vk_cmd_dep(cmd, acquired,
-               VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT |
-               VK_PIPELINE_STAGE_TRANSFER_BIT);
-
-    return vk_flush(ra, done);
+    tex_barrier(ra, cmd, tex, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+                VK_ACCESS_MEMORY_READ_BIT, VK_IMAGE_LAYOUT_PRESENT_SRC_KHR,
+                false);
 
-error:
-    return false;
+    // Return this directly instead of going through vk_submit
+    p->cmd = NULL;
+    return cmd;
 }
diff --git a/video/out/vulkan/ra_vk.h b/video/out/vulkan/ra_vk.h
index 893421b..da613c7 100644
--- a/video/out/vulkan/ra_vk.h
+++ b/video/out/vulkan/ra_vk.h
@@ -16,15 +16,15 @@ VkDevice ra_vk_get_dev(struct ra *ra);
 struct ra_tex *ra_vk_wrap_swapchain_img(struct ra *ra, VkImage vkimg,
                                         VkSwapchainCreateInfoKHR info);
 
-// This function flushes the command buffers, transitions `tex` (which must be
-// a wrapped swapchain image) into a format suitable for presentation, and
-// submits the current rendering commands. The indicated semaphore must fire
-// before the submitted command can run. If `done` is non-NULL, it will be
-// set to a semaphore that fires once the command completes. If `inflight`
-// is non-NULL, it will be incremented when the command starts and decremented
-// when it completes.
-bool ra_vk_submit(struct ra *ra, struct ra_tex *tex, VkSemaphore acquired,
-                  VkSemaphore *done, int *inflight);
+// Associates an external semaphore (dependency) with a ra_tex, such that this
+// ra_tex will not be used by the ra_vk until the external semaphore fires.
+void ra_tex_vk_external_dep(struct ra *ra, struct ra_tex *tex, VkSemaphore dep);
+
+// This function finalizes rendering, transitions `tex` (which must be a
+// wrapped swapchain image) into a format suitable for presentation, and returns
+// the resulting command buffer (or NULL on error). The caller may add their
+// own semaphores to this command buffer, and must submit it afterwards.
+struct vk_cmd *ra_vk_submit(struct ra *ra, struct ra_tex *tex);
 
 // May be called on a struct ra of any type. Returns NULL if the ra is not
 // a vulkan ra.
diff --git a/video/out/vulkan/utils.c b/video/out/vulkan/utils.c
index baf0ebc..cfe9737 100644
--- a/video/out/vulkan/utils.c
+++ b/video/out/vulkan/utils.c
@@ -128,20 +128,10 @@ static VkBool32 vk_dbg_callback(VkDebugReportFlagsEXT flags,
     return (flags & VK_DEBUG_REPORT_ERROR_BIT_EXT);
 }
 
-static void vk_cmdpool_uninit(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
-{
-    if (!pool)
-        return;
-
-    // also frees associated command buffers
-    vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR);
-    for (int n = 0; n < MPVK_MAX_CMDS; n++) {
-        vkDestroyFence(vk->dev, pool->cmds[n].fence, MPVK_ALLOCATOR);
-        vkDestroySemaphore(vk->dev, pool->cmds[n].done, MPVK_ALLOCATOR);
-        talloc_free(pool->cmds[n].callbacks);
-    }
-    talloc_free(pool);
-}
+static void vk_cmdpool_destroy(struct mpvk_ctx *vk, struct vk_cmdpool *pool);
+static struct vk_cmdpool *vk_cmdpool_create(struct mpvk_ctx *vk,
+                                            VkDeviceQueueCreateInfo qinfo,
+                                            VkQueueFamilyProperties props);
 
 void mpvk_uninit(struct mpvk_ctx *vk)
 {
@@ -149,7 +139,18 @@ void mpvk_uninit(struct mpvk_ctx *vk)
         return;
 
     if (vk->dev) {
-        vk_cmdpool_uninit(vk, vk->pool);
+        mpvk_flush_commands(vk);
+        mpvk_poll_commands(vk, UINT64_MAX);
+        assert(vk->num_cmds_queued == 0);
+        assert(vk->num_cmds_pending == 0);
+        talloc_free(vk->cmds_queued);
+        talloc_free(vk->cmds_pending);
+        for (int i = 0; i < vk->num_pools; i++)
+            vk_cmdpool_destroy(vk, vk->pools[i]);
+        talloc_free(vk->pools);
+        for (int i = 0; i < vk->num_signals; i++)
+            vk_signal_destroy(vk, &vk->signals[i]);
+        talloc_free(vk->signals);
         vk_malloc_uninit(vk);
         vkDestroyDevice(vk->dev, MPVK_ALLOCATOR);
     }
@@ -315,6 +316,7 @@ bool mpvk_find_phys_device(struct mpvk_ctx *vk, const char *name, bool sw)
                     (int)VK_VERSION_PATCH(prop.apiVersion));
             vk->physd = devices[i];
             vk->limits = prop.limits;
+            vkGetPhysicalDeviceFeatures(vk->physd, &vk->features);
             talloc_free(devices);
             return true;
         }
@@ -384,62 +386,56 @@ error:
     return false;
 }
 
-static bool vk_cmdpool_init(struct mpvk_ctx *vk, VkDeviceQueueCreateInfo qinfo,
-                            VkQueueFamilyProperties props,
-                            struct vk_cmdpool **out)
+// Find the most specialized queue supporting a combination of flags. In cases
+// where there are multiple queue families at the same specialization level,
+// this finds the one with the most queues. Returns -1 if no queue was found.
+static int find_qf(VkQueueFamilyProperties *qfs, int qfnum, VkQueueFlags flags)
 {
-    struct vk_cmdpool *pool = *out = talloc_ptrtype(NULL, pool);
-    *pool = (struct vk_cmdpool) {
-        .qf = qinfo.queueFamilyIndex,
-        .props = props,
-        .qcount = qinfo.queueCount,
-    };
-
-    for (int n = 0; n < pool->qcount; n++)
-        vkGetDeviceQueue(vk->dev, pool->qf, n, &pool->queues[n]);
-
-    VkCommandPoolCreateInfo cinfo = {
-        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
-        .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
-                 VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
-        .queueFamilyIndex = pool->qf,
-    };
-
-    VK(vkCreateCommandPool(vk->dev, &cinfo, MPVK_ALLOCATOR, &pool->pool));
-
-    VkCommandBufferAllocateInfo ainfo = {
-        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
-        .commandPool = pool->pool,
-        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
-        .commandBufferCount = MPVK_MAX_CMDS,
-    };
+    int idx = -1;
+    for (int i = 0; i < qfnum; i++) {
+        if (!(qfs[i].queueFlags & flags))
+            continue;
 
-    VkCommandBuffer cmdbufs[MPVK_MAX_CMDS];
-    VK(vkAllocateCommandBuffers(vk->dev, &ainfo, cmdbufs));
+        // QF is more specialized. Since we don't care about other bits like
+        // SPARSE_BIT, mask the ones we're interested in
+        const VkQueueFlags mask = VK_QUEUE_GRAPHICS_BIT |
+                                  VK_QUEUE_TRANSFER_BIT |
+                                  VK_QUEUE_COMPUTE_BIT;
 
-    for (int n = 0; n < MPVK_MAX_CMDS; n++) {
-        struct vk_cmd *cmd = &pool->cmds[n];
-        cmd->pool = pool;
-        cmd->buf = cmdbufs[n];
+        if (idx < 0 || (qfs[i].queueFlags & mask) < (qfs[idx].queueFlags & mask))
+            idx = i;
 
-        VkFenceCreateInfo finfo = {
-            .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
-            .flags = VK_FENCE_CREATE_SIGNALED_BIT,
-        };
+        // QF has more queues (at the same specialization level)
+        if (qfs[i].queueFlags == qfs[idx].queueFlags &&
+            qfs[i].queueCount > qfs[idx].queueCount)
+            idx = i;
+    }
 
-        VK(vkCreateFence(vk->dev, &finfo, MPVK_ALLOCATOR, &cmd->fence));
+    return idx;
+}
 
-        VkSemaphoreCreateInfo sinfo = {
-            .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
-        };
+static void add_qinfo(void *tactx, VkDeviceQueueCreateInfo **qinfos,
+                      int *num_qinfos, VkQueueFamilyProperties *qfs, int idx,
+                      int qcount)
+{
+    if (idx < 0)
+        return;
 
-        VK(vkCreateSemaphore(vk->dev, &sinfo, MPVK_ALLOCATOR, &cmd->done));
+    // Check to see if we've already added this queue family
+    for (int i = 0; i < *num_qinfos; i++) {
+        if ((*qinfos)[i].queueFamilyIndex == idx)
+            return;
     }
 
-    return true;
+    float *priorities = talloc_zero_array(tactx, float, qcount);
+    VkDeviceQueueCreateInfo qinfo = {
+        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
+        .queueFamilyIndex = idx,
+        .queueCount = MPMIN(qcount, qfs[idx].queueCount),
+        .pQueuePriorities = priorities,
+    };
 
-error:
-    return false;
+    MP_TARRAY_APPEND(tactx, *qinfos, *num_qinfos, qinfo);
 }
 
 bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
@@ -460,47 +456,42 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
                    (unsigned)qfs[i].queueFlags, (int)qfs[i].queueCount);
     }
 
-    // For most of our rendering operations, we want to use one "primary" pool,
-    // so just pick the queue family with the most features.
-    int idx = -1;
-    for (int i = 0; i < qfnum; i++) {
-        if (!(qfs[i].queueFlags & VK_QUEUE_GRAPHICS_BIT))
-            continue;
-
-        // QF supports more features
-        if (idx < 0 || qfs[i].queueFlags > qfs[idx].queueFlags)
-            idx = i;
-
-        // QF supports more queues (at the same specialization level)
-        if (qfs[i].queueFlags == qfs[idx].queueFlags &&
-            qfs[i].queueCount > qfs[idx].queueCount)
-        {
-            idx = i;
-        }
-    }
+    int idx_gfx = -1, idx_comp = -1, idx_tf = -1;
+    idx_gfx = find_qf(qfs, qfnum, VK_QUEUE_GRAPHICS_BIT);
+    if (opts.async_compute)
+        idx_comp = find_qf(qfs, qfnum, VK_QUEUE_COMPUTE_BIT);
+    if (opts.async_transfer)
+        idx_tf = find_qf(qfs, qfnum, VK_QUEUE_TRANSFER_BIT);
 
     // Vulkan requires at least one GRAPHICS queue, so if this fails something
     // is horribly wrong.
-    assert(idx >= 0);
+    assert(idx_gfx >= 0);
+    MP_VERBOSE(vk, "Using graphics queue (QF %d)\n", idx_gfx);
 
     // Ensure we can actually present to the surface using this queue
     VkBool32 sup;
-    VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx, vk->surf, &sup));
+    VK(vkGetPhysicalDeviceSurfaceSupportKHR(vk->physd, idx_gfx, vk->surf, &sup));
     if (!sup) {
         MP_ERR(vk, "Queue family does not support surface presentation!\n");
         goto error;
     }
 
-    // Now that we know which queue families we want, we can create the logical
-    // device
-    assert(opts.queue_count <= MPVK_MAX_QUEUES);
-    static const float priorities[MPVK_MAX_QUEUES] = {0};
-    VkDeviceQueueCreateInfo qinfo = {
-        .sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,
-        .queueFamilyIndex = idx,
-        .queueCount = MPMIN(qfs[idx].queueCount, opts.queue_count),
-        .pQueuePriorities = priorities,
-    };
+    if (idx_tf >= 0 && idx_tf != idx_gfx)
+        MP_VERBOSE(vk, "Using async transfer (QF %d)\n", idx_tf);
+    if (idx_comp >= 0 && idx_comp != idx_gfx)
+        MP_VERBOSE(vk, "Using async compute (QF %d)\n", idx_comp);
+
+    // Fall back to supporting compute shaders via the graphics pool for
+    // devices which support compute shaders but not async compute.
+    if (idx_comp < 0 && qfs[idx_gfx].queueFlags & VK_QUEUE_COMPUTE_BIT)
+        idx_comp = idx_gfx;
+
+    // Now that we know which QFs we want, we can create the logical device
+    VkDeviceQueueCreateInfo *qinfos = NULL;
+    int num_qinfos = 0;
+    add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_gfx, opts.queue_count);
+    add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_comp, opts.queue_count);
+    add_qinfo(tmp, &qinfos, &num_qinfos, qfs, idx_tf, opts.queue_count);
 
     const char **exts = NULL;
     int num_exts = 0;
@@ -508,12 +499,21 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
     if (vk->spirv->required_ext)
         MP_TARRAY_APPEND(tmp, exts, num_exts, vk->spirv->required_ext);
 
+    // Enable all features we optionally use
+#define FEATURE(name) .name = vk->features.name
+    VkPhysicalDeviceFeatures feats = {
+        FEATURE(shaderImageGatherExtended),
+        FEATURE(shaderStorageImageExtendedFormats),
+    };
+#undef FEATURE
+
     VkDeviceCreateInfo dinfo = {
         .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,
-        .queueCreateInfoCount = 1,
-        .pQueueCreateInfos = &qinfo,
+        .pQueueCreateInfos = qinfos,
+        .queueCreateInfoCount = num_qinfos,
         .ppEnabledExtensionNames = exts,
         .enabledExtensionCount = num_exts,
+        .pEnabledFeatures = &feats,
     };
 
     MP_VERBOSE(vk, "Creating vulkan device with extensions:\n");
@@ -522,12 +522,24 @@ bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts)
 
     VK(vkCreateDevice(vk->physd, &dinfo, MPVK_ALLOCATOR, &vk->dev));
 
-    vk_malloc_init(vk);
-
-    // Create the vk_cmdpools and all required queues / synchronization objects
-    if (!vk_cmdpool_init(vk, qinfo, qfs[idx], &vk->pool))
-        goto error;
+    // Create the command pools and memory allocator
+    for (int i = 0; i < num_qinfos; i++) {
+        int qf = qinfos[i].queueFamilyIndex;
+        struct vk_cmdpool *pool = vk_cmdpool_create(vk, qinfos[i], qfs[qf]);
+        if (!pool)
+            goto error;
+        MP_TARRAY_APPEND(NULL, vk->pools, vk->num_pools, pool);
+
+        // Update the pool_* pointers based on the corresponding QF index
+        if (qf == idx_gfx)
+            vk->pool_graphics = pool;
+        if (qf == idx_comp)
+            vk->pool_compute = pool;
+        if (qf == idx_tf)
+            vk->pool_transfer = pool;
+    }
 
+    vk_malloc_init(vk);
     talloc_free(tmp);
     return true;
 
@@ -537,83 +549,197 @@ error:
     return false;
 }
 
-static void run_callbacks(struct mpvk_ctx *vk, struct vk_cmd *cmd)
+// returns VK_SUCCESS (completed), VK_TIMEOUT (not yet completed) or an error
+static VkResult vk_cmd_poll(struct mpvk_ctx *vk, struct vk_cmd *cmd,
+                            uint64_t timeout)
+{
+    return vkWaitForFences(vk->dev, 1, &cmd->fence, false, timeout);
+}
+
+static void vk_cmd_reset(struct mpvk_ctx *vk, struct vk_cmd *cmd)
 {
     for (int i = 0; i < cmd->num_callbacks; i++) {
         struct vk_callback *cb = &cmd->callbacks[i];
         cb->run(cb->priv, cb->arg);
-        *cb = (struct vk_callback){0};
     }
 
     cmd->num_callbacks = 0;
+    cmd->num_deps = 0;
+    cmd->num_sigs = 0;
 
-    // Also reset vk->last_cmd in case this was the last command to run
+    // also make sure to reset vk->last_cmd in case this was the last command
     if (vk->last_cmd == cmd)
         vk->last_cmd = NULL;
 }
 
-static void wait_for_cmds(struct mpvk_ctx *vk, struct vk_cmd cmds[], int num)
+static void vk_cmd_destroy(struct mpvk_ctx *vk, struct vk_cmd *cmd)
 {
-    if (!num)
+    if (!cmd)
         return;
 
-    VkFence fences[MPVK_MAX_CMDS];
-    for (int i = 0; i < num; i++)
-        fences[i] = cmds[i].fence;
+    vk_cmd_poll(vk, cmd, UINT64_MAX);
+    vk_cmd_reset(vk, cmd);
+    vkDestroyFence(vk->dev, cmd->fence, MPVK_ALLOCATOR);
+    vkFreeCommandBuffers(vk->dev, cmd->pool->pool, 1, &cmd->buf);
+
+    talloc_free(cmd);
+}
 
-    vkWaitForFences(vk->dev, num, fences, true, UINT64_MAX);
+static struct vk_cmd *vk_cmd_create(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
+{
+    struct vk_cmd *cmd = talloc_zero(NULL, struct vk_cmd);
+    cmd->pool = pool;
 
-    for (int i = 0; i < num; i++)
-        run_callbacks(vk, &cmds[i]);
+    VkCommandBufferAllocateInfo ainfo = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
+        .commandPool = pool->pool,
+        .level = VK_COMMAND_BUFFER_LEVEL_PRIMARY,
+        .commandBufferCount = 1,
+    };
+
+    VK(vkAllocateCommandBuffers(vk->dev, &ainfo, &cmd->buf));
+
+    VkFenceCreateInfo finfo = {
+        .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
+        .flags = VK_FENCE_CREATE_SIGNALED_BIT,
+    };
+
+    VK(vkCreateFence(vk->dev, &finfo, MPVK_ALLOCATOR, &cmd->fence));
+
+    return cmd;
+
+error:
+    vk_cmd_destroy(vk, cmd);
+    return NULL;
 }
 
-void mpvk_pool_wait_idle(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
+void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg)
 {
-    if (!pool)
-        return;
+    MP_TARRAY_APPEND(cmd, cmd->callbacks, cmd->num_callbacks, (struct vk_callback) {
+        .run  = callback,
+        .priv = p,
+        .arg  = arg,
+    });
+}
 
-    int idx = pool->cindex, pidx = pool->cindex_pending;
-    if (pidx < idx) { // range doesn't wrap
-        wait_for_cmds(vk, &pool->cmds[pidx], idx - pidx);
-    } else if (pidx > idx) { // range wraps
-        wait_for_cmds(vk, &pool->cmds[pidx], MPVK_MAX_CMDS - pidx);
-        wait_for_cmds(vk, &pool->cmds[0], idx);
-    }
-    pool->cindex_pending = pool->cindex;
+void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, VkPipelineStageFlags stage)
+{
+    int idx = cmd->num_deps++;
+    MP_TARRAY_GROW(cmd, cmd->deps, idx);
+    MP_TARRAY_GROW(cmd, cmd->depstages, idx);
+    cmd->deps[idx] = dep;
+    cmd->depstages[idx] = stage;
 }
 
-void mpvk_dev_wait_idle(struct mpvk_ctx *vk)
+void vk_cmd_sig(struct vk_cmd *cmd, VkSemaphore sig)
 {
-    mpvk_pool_wait_idle(vk, vk->pool);
+    MP_TARRAY_APPEND(cmd, cmd->sigs, cmd->num_sigs, sig);
 }
 
-void mpvk_pool_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool,
-                         uint64_t timeout)
+static void vk_cmdpool_destroy(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
 {
     if (!pool)
         return;
 
-    // If requested, hard block until at least one command completes
-    if (timeout > 0 && pool->cindex_pending != pool->cindex) {
-        vkWaitForFences(vk->dev, 1, &pool->cmds[pool->cindex_pending].fence,
-                        true, timeout);
-    }
+    for (int i = 0; i < pool->num_cmds; i++)
+        vk_cmd_destroy(vk, pool->cmds[i]);
+
+    vkDestroyCommandPool(vk->dev, pool->pool, MPVK_ALLOCATOR);
+    talloc_free(pool);
+}
+
+static struct vk_cmdpool *vk_cmdpool_create(struct mpvk_ctx *vk,
+                                            VkDeviceQueueCreateInfo qinfo,
+                                            VkQueueFamilyProperties props)
+{
+    struct vk_cmdpool *pool = talloc_ptrtype(NULL, pool);
+    *pool = (struct vk_cmdpool) {
+        .props = props,
+        .qf = qinfo.queueFamilyIndex,
+        .queues = talloc_array(pool, VkQueue, qinfo.queueCount),
+        .num_queues = qinfo.queueCount,
+    };
+
+    for (int n = 0; n < pool->num_queues; n++)
+        vkGetDeviceQueue(vk->dev, pool->qf, n, &pool->queues[n]);
+
+    VkCommandPoolCreateInfo cinfo = {
+        .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
+        .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT |
+                 VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+        .queueFamilyIndex = pool->qf,
+    };
+
+    VK(vkCreateCommandPool(vk->dev, &cinfo, MPVK_ALLOCATOR, &pool->pool));
+
+    return pool;
 
-    // Lazily garbage collect the commands based on their status
-    while (pool->cindex_pending != pool->cindex) {
-        struct vk_cmd *cmd = &pool->cmds[pool->cindex_pending];
-        VkResult res = vkGetFenceStatus(vk->dev, cmd->fence);
-        if (res != VK_SUCCESS)
+error:
+    vk_cmdpool_destroy(vk, pool);
+    return NULL;
+}
+
+void mpvk_poll_commands(struct mpvk_ctx *vk, uint64_t timeout)
+{
+    while (vk->num_cmds_pending > 0) {
+        struct vk_cmd *cmd = vk->cmds_pending[0];
+        struct vk_cmdpool *pool = cmd->pool;
+        VkResult res = vk_cmd_poll(vk, cmd, timeout);
+        if (res == VK_TIMEOUT)
             break;
-        run_callbacks(vk, cmd);
-        pool->cindex_pending++;
-        pool->cindex_pending %= MPVK_MAX_CMDS;
+        vk_cmd_reset(vk, cmd);
+        MP_TARRAY_REMOVE_AT(vk->cmds_pending, vk->num_cmds_pending, 0);
+        MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
     }
 }
 
-void mpvk_dev_poll_cmds(struct mpvk_ctx *vk, uint32_t timeout)
+bool mpvk_flush_commands(struct mpvk_ctx *vk)
 {
-    mpvk_pool_poll_cmds(vk, vk->pool, timeout);
+    bool ret = true;
+
+    for (int i = 0; i < vk->num_cmds_queued; i++) {
+        struct vk_cmd *cmd = vk->cmds_queued[i];
+        struct vk_cmdpool *pool = cmd->pool;
+
+        VkSubmitInfo sinfo = {
+            .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
+            .commandBufferCount = 1,
+            .pCommandBuffers = &cmd->buf,
+            .waitSemaphoreCount = cmd->num_deps,
+            .pWaitSemaphores = cmd->deps,
+            .pWaitDstStageMask = cmd->depstages,
+            .signalSemaphoreCount = cmd->num_sigs,
+            .pSignalSemaphores = cmd->sigs,
+        };
+
+        VK(vkQueueSubmit(cmd->queue, 1, &sinfo, cmd->fence));
+        MP_TARRAY_APPEND(NULL, vk->cmds_pending, vk->num_cmds_pending, cmd);
+
+        if (mp_msg_test(vk->log, MSGL_TRACE)) {
+            MP_TRACE(vk, "Submitted command on queue %p (QF %d):\n",
+                     (void *)cmd->queue, pool->qf);
+            for (int n = 0; n < cmd->num_deps; n++)
+                MP_TRACE(vk, "    waits on semaphore %p\n", (void *)cmd->deps[n]);
+            for (int n = 0; n < cmd->num_sigs; n++)
+                MP_TRACE(vk, "    signals semaphore %p\n", (void *)cmd->sigs[n]);
+        }
+        continue;
+
+error:
+        vk_cmd_reset(vk, cmd);
+        MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
+        ret = false;
+    }
+
+    vk->num_cmds_queued = 0;
+
+    // Rotate the queues to ensure good parallelism across frames
+    for (int i = 0; i < vk->num_pools; i++) {
+        struct vk_cmdpool *pool = vk->pools[i];
+        pool->idx_queues = (pool->idx_queues + 1) % pool->num_queues;
+    }
+
+    return ret;
 }
 
 void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg)
@@ -626,39 +752,22 @@ void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg)
     }
 }
 
-void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg)
-{
-    MP_TARRAY_GROW(NULL, cmd->callbacks, cmd->num_callbacks);
-    cmd->callbacks[cmd->num_callbacks++] = (struct vk_callback) {
-        .run  = callback,
-        .priv = p,
-        .arg  = arg,
-    };
-}
-
-void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep,
-                VkPipelineStageFlags depstage)
-{
-    assert(cmd->num_deps < MPVK_MAX_CMD_DEPS);
-    cmd->deps[cmd->num_deps] = dep;
-    cmd->depstages[cmd->num_deps++] = depstage;
-}
-
 struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
 {
-    // Garbage collect the cmdpool first
-    mpvk_pool_poll_cmds(vk, pool, 0);
+    // garbage collect the cmdpool first, to increase the chances of getting
+    // an already-available command buffer
+    mpvk_poll_commands(vk, 0);
 
-    int next = (pool->cindex + 1) % MPVK_MAX_CMDS;
-    if (next == pool->cindex_pending) {
-        MP_ERR(vk, "No free command buffers!\n");
-        goto error;
-    }
+    struct vk_cmd *cmd = NULL;
+    if (MP_TARRAY_POP(pool->cmds, pool->num_cmds, &cmd))
+        goto done;
 
-    struct vk_cmd *cmd = &pool->cmds[pool->cindex];
-    pool->cindex = next;
+    // No free command buffers => allocate another one
+    cmd = vk_cmd_create(vk, pool);
+    if (!cmd)
+        goto error;
 
-    VK(vkResetCommandBuffer(cmd->buf, 0));
+done: ;
 
     VkCommandBufferBeginInfo binfo = {
         .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
@@ -667,54 +776,145 @@ struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool)
 
     VK(vkBeginCommandBuffer(cmd->buf, &binfo));
 
+    cmd->queue = pool->queues[pool->idx_queues];
     return cmd;
 
 error:
+    // Something has to be seriously messed up if we get to this point
+    vk_cmd_destroy(vk, cmd);
     return NULL;
 }
 
-bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done)
+void vk_cmd_queue(struct mpvk_ctx *vk, struct vk_cmd *cmd)
 {
+    struct vk_cmdpool *pool = cmd->pool;
+
     VK(vkEndCommandBuffer(cmd->buf));
 
-    struct vk_cmdpool *pool = cmd->pool;
-    VkQueue queue = pool->queues[pool->qindex];
+    VK(vkResetFences(vk->dev, 1, &cmd->fence));
+    MP_TARRAY_APPEND(NULL, vk->cmds_queued, vk->num_cmds_queued, cmd);
+    vk->last_cmd = cmd;
+    return;
 
-    VkSubmitInfo sinfo = {
-        .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
-        .commandBufferCount = 1,
-        .pCommandBuffers = &cmd->buf,
-        .waitSemaphoreCount = cmd->num_deps,
-        .pWaitSemaphores = cmd->deps,
-        .pWaitDstStageMask = cmd->depstages,
+error:
+    vk_cmd_reset(vk, cmd);
+    MP_TARRAY_APPEND(pool, pool->cmds, pool->num_cmds, cmd);
+}
+
+void vk_signal_destroy(struct mpvk_ctx *vk, struct vk_signal **sig)
+{
+    if (!*sig)
+        return;
+
+    vkDestroySemaphore(vk->dev, (*sig)->semaphore, MPVK_ALLOCATOR);
+    vkDestroyEvent(vk->dev, (*sig)->event, MPVK_ALLOCATOR);
+    talloc_free(*sig);
+    *sig = NULL;
+}
+
+struct vk_signal *vk_cmd_signal(struct mpvk_ctx *vk, struct vk_cmd *cmd,
+                                VkPipelineStageFlags stage)
+{
+    struct vk_signal *sig = NULL;
+    if (MP_TARRAY_POP(vk->signals, vk->num_signals, &sig))
+        goto done;
+
+    // no available signal => initialize a new one
+    sig = talloc_zero(NULL, struct vk_signal);
+    static const VkSemaphoreCreateInfo sinfo = {
+        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
     };
 
-    if (done) {
-        sinfo.signalSemaphoreCount = 1;
-        sinfo.pSignalSemaphores = &cmd->done;
-        *done = cmd->done;
-    }
+    VK(vkCreateSemaphore(vk->dev, &sinfo, MPVK_ALLOCATOR, &sig->semaphore));
 
-    VK(vkResetFences(vk->dev, 1, &cmd->fence));
-    VK(vkQueueSubmit(queue, 1, &sinfo, cmd->fence));
-    MP_TRACE(vk, "Submitted command on queue %p (QF %d)\n", (void *)queue,
-             pool->qf);
+    static const VkEventCreateInfo einfo = {
+        .sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO,
+    };
 
-    for (int i = 0; i < cmd->num_deps; i++)
-        cmd->deps[i] = NULL;
-    cmd->num_deps = 0;
+    VK(vkCreateEvent(vk->dev, &einfo, MPVK_ALLOCATOR, &sig->event));
 
-    vk->last_cmd = cmd;
-    return true;
+done:
+    // Signal both the semaphore and the event if possible. (We will only
+    // end up using one or the other)
+    vk_cmd_sig(cmd, sig->semaphore);
+
+    VkQueueFlags req = VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+    if (cmd->pool->props.queueFlags & req) {
+        vkCmdSetEvent(cmd->buf, sig->event, stage);
+        sig->event_source = cmd->queue;
+    }
+
+    return sig;
 
 error:
+    vk_signal_destroy(vk, &sig);
+    return NULL;
+}
+
+static bool unsignal_cmd(struct vk_cmd *cmd, VkSemaphore sem)
+{
+    for (int n = 0; n < cmd->num_sigs; n++) {
+        if (cmd->sigs[n] == sem) {
+            MP_TARRAY_REMOVE_AT(cmd->sigs, cmd->num_sigs, n);
+            return true;
+        }
+    }
+
     return false;
 }
 
-void vk_cmd_cycle_queues(struct mpvk_ctx *vk)
+// Attempts to remove a queued signal operation. Returns true if successful,
+// i.e. the signal could be removed before it ever got fired.
+static bool unsignal(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore sem)
 {
-    struct vk_cmdpool *pool = vk->pool;
-    pool->qindex = (pool->qindex + 1) % pool->qcount;
+    if (unsignal_cmd(cmd, sem))
+        return true;
+
+    // Attempt to remove it from any queued commands
+    for (int i = 0; i < vk->num_cmds_queued; i++) {
+        if (unsignal_cmd(vk->cmds_queued[i], sem))
+            return true;
+    }
+
+    return false;
+}
+
+static void release_signal(struct mpvk_ctx *vk, struct vk_signal *sig)
+{
+    // The semaphore never needs to be recreated, because it's either
+    // unsignaled while still queued, or unsignaled as a result of a device
+    // wait. But the event *may* need to be reset, so just always reset it.
+    if (sig->event_source)
+        vkResetEvent(vk->dev, sig->event);
+    sig->event_source = NULL;
+    MP_TARRAY_APPEND(NULL, vk->signals, vk->num_signals, sig);
+}
+
+void vk_cmd_wait(struct mpvk_ctx *vk, struct vk_cmd *cmd,
+                 struct vk_signal **sigptr, VkPipelineStageFlags stage,
+                 VkEvent *out_event)
+{
+    struct vk_signal *sig = *sigptr;
+    if (!sig)
+        return;
+
+    if (out_event && sig->event && sig->event_source == cmd->queue &&
+        unsignal(vk, cmd, sig->semaphore))
+    {
+        // If we can remove the semaphore signal operation from the history and
+        // pretend it never happened, then we get to use the VkEvent. This also
+        // requires that the VkEvent was signalled from the same VkQueue.
+        *out_event = sig->event;
+    } else if (sig->semaphore) {
+        // Otherwise, we use the semaphore. (This also unsignals it as a result
+        // of the command execution)
+        vk_cmd_dep(cmd, sig->semaphore, stage);
+    }
+
+    // In either case, once the command completes, we can release the signal
+    // resource back to the pool.
+    vk_cmd_callback(cmd, (vk_cb) release_signal, vk, sig);
+    *sigptr = NULL;
 }
 
 const VkImageSubresourceRange vk_range = {
diff --git a/video/out/vulkan/utils.h b/video/out/vulkan/utils.h
index 0cc8a29..2962313 100644
--- a/video/out/vulkan/utils.h
+++ b/video/out/vulkan/utils.h
@@ -55,22 +55,26 @@ bool mpvk_pick_surface_format(struct mpvk_ctx *vk);
 
 struct mpvk_device_opts {
     int queue_count;    // number of queues to use
+    int async_transfer; // enable async transfer
+    int async_compute;  // enable async compute
 };
 
 // Create a logical device and initialize the vk_cmdpools
 bool mpvk_device_init(struct mpvk_ctx *vk, struct mpvk_device_opts opts);
 
-// Wait until all commands submitted to all queues have completed
-void mpvk_pool_wait_idle(struct mpvk_ctx *vk, struct vk_cmdpool *pool);
-void mpvk_dev_wait_idle(struct mpvk_ctx *vk);
+// Wait for all currently pending commands to have completed. This is the only
+// function that actually processes the callbacks. Will wait at most `timeout`
+// nanoseconds for the completion of each command. Using it with a value of
+// UINT64_MAX effectively means waiting until the pool/device is idle. The
+// timeout may also be passed as 0, in which case this function will not block,
+// but only poll for completed commands.
+void mpvk_poll_commands(struct mpvk_ctx *vk, uint64_t timeout);
 
-// Wait until at least one command submitted to any queue has completed, and
-// process the callbacks. Good for event loops that need to delay until a
-// command completes. Will block at most `timeout` nanoseconds. If used with
-// 0, it only garbage collects completed commands without blocking.
-void mpvk_pool_poll_cmds(struct mpvk_ctx *vk, struct vk_cmdpool *pool,
-                         uint64_t timeout);
-void mpvk_dev_poll_cmds(struct mpvk_ctx *vk, uint32_t timeout);
+// Flush all currently queued commands. Call this once per frame, after
+// submitting all of the command buffers for that frame. Calling this more
+// often than that is possible but bad for performance.
+// Returns whether successful. Failed commands will be implicitly dropped.
+bool mpvk_flush_commands(struct mpvk_ctx *vk);
 
 // Since lots of vulkan operations need to be done lazily once the affected
 // resources are no longer in use, provide an abstraction for tracking these.
@@ -88,20 +92,22 @@ struct vk_callback {
 // This will essentially run once the device is completely idle.
 void vk_dev_callback(struct mpvk_ctx *vk, vk_cb callback, void *p, void *arg);
 
-#define MPVK_MAX_CMD_DEPS 8
-
 // Helper wrapper around command buffers that also track dependencies,
 // callbacks and synchronization primitives
 struct vk_cmd {
     struct vk_cmdpool *pool; // pool it was allocated from
-    VkCommandBuffer buf;
-    VkFence fence; // the fence guards cmd buffer reuse
-    VkSemaphore done; // the semaphore signals when execution is done
+    VkQueue queue;           // the submission queue (for recording/pending)
+    VkCommandBuffer buf;     // the command buffer itself
+    VkFence fence;           // the fence guards cmd buffer reuse
     // The semaphores represent dependencies that need to complete before
     // this command can be executed. These are *not* owned by the vk_cmd
-    VkSemaphore deps[MPVK_MAX_CMD_DEPS];
-    VkPipelineStageFlags depstages[MPVK_MAX_CMD_DEPS];
+    VkSemaphore *deps;
+    VkPipelineStageFlags *depstages;
     int num_deps;
+    // The signals represent semaphores that fire once the command finishes
+    // executing. These are also not owned by the vk_cmd
+    VkSemaphore *sigs;
+    int num_sigs;
     // Since VkFences are useless, we have to manually track "callbacks"
     // to fire once the VkFence completes. These are used for multiple purposes,
     // ranging from garbage collection (resource deallocation) to fencing.
@@ -113,41 +119,64 @@ struct vk_cmd {
 // bool will be set to `true` once the command completes, or shortly thereafter.
 void vk_cmd_callback(struct vk_cmd *cmd, vk_cb callback, void *p, void *arg);
 
-// Associate a dependency for the current command. This semaphore must signal
-// by the corresponding stage before the command may execute.
-void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep,
-                VkPipelineStageFlags depstage);
+// Associate a raw dependency for the current command. This semaphore must
+// signal by the corresponding stage before the command may execute.
+void vk_cmd_dep(struct vk_cmd *cmd, VkSemaphore dep, VkPipelineStageFlags stage);
+
+// Associate a raw signal with the current command. This semaphore will signal
+// after the command completes.
+void vk_cmd_sig(struct vk_cmd *cmd, VkSemaphore sig);
+
+// Signal abstraction: represents an abstract synchronization mechanism.
+// Internally, this may either resolve as a semaphore or an event depending
+// on whether the appropriate conditions are met.
+struct vk_signal {
+    VkSemaphore semaphore;
+    VkEvent event;
+    VkQueue event_source;
+};
 
-#define MPVK_MAX_QUEUES 8
-#define MPVK_MAX_CMDS 64
+// Generates a signal after the execution of all previous commands matching the
+// given pipeline stage. The signal is owned by the caller, and must be
+// consumed with vk_cmd_wait or released with vk_signal_destroy in order to
+// free the resources.
+struct vk_signal *vk_cmd_signal(struct mpvk_ctx *vk, struct vk_cmd *cmd,
+                                VkPipelineStageFlags stage);
+
+// Consumes a previously generated signal. This signal must fire by the
+// indicated stage before the command can run. If out_event is not NULL, then
+// *out_event MAY be set to a VkEvent which the caller MUST manually wait on in
+// the most appropriate way. This function takes over ownership of the signal
+// (and the signal will be released/reused automatically)
+void vk_cmd_wait(struct mpvk_ctx *vk, struct vk_cmd *cmd,
+                 struct vk_signal **sigptr, VkPipelineStageFlags stage,
+                 VkEvent *out_event);
+
+// Destroys a currently pending signal, for example if the resource is no
+// longer relevant.
+void vk_signal_destroy(struct mpvk_ctx *vk, struct vk_signal **sig);
 
 // Command pool / queue family hybrid abstraction
 struct vk_cmdpool {
     VkQueueFamilyProperties props;
-    uint32_t qf; // queue family index
+    int qf; // queue family index
     VkCommandPool pool;
-    VkQueue queues[MPVK_MAX_QUEUES];
-    int qcount;
-    int qindex;
-    // Command buffers associated with this queue
-    struct vk_cmd cmds[MPVK_MAX_CMDS];
-    int cindex;
-    int cindex_pending;
+    VkQueue *queues;
+    int num_queues;
+    int idx_queues;
+    // Command buffers associated with this queue. These are available for
+    // re-recording
+    struct vk_cmd **cmds;
+    int num_cmds;
 };
 
-// Fetch the next command buffer from a command pool and begin recording to it.
+// Fetch a command buffer from a command pool and begin recording to it.
 // Returns NULL on failure.
 struct vk_cmd *vk_cmd_begin(struct mpvk_ctx *vk, struct vk_cmdpool *pool);
 
-// Finish the currently recording command buffer and submit it for execution.
-// If `done` is not NULL, it will be set to a semaphore that will signal once
-// the command completes. (And MUST have a corresponding semaphore wait)
-// Returns whether successful.
-bool vk_cmd_submit(struct mpvk_ctx *vk, struct vk_cmd *cmd, VkSemaphore *done);
-
-// Rotate the queues for each vk_cmdpool. Call this once per frame to ensure
-// good parallelism between frames when using multiple queues
-void vk_cmd_cycle_queues(struct mpvk_ctx *vk);
+// Finish recording a command buffer and queue it for execution. This function
+// takes over ownership of *cmd, i.e. the caller should not touch it again.
+void vk_cmd_queue(struct mpvk_ctx *vk, struct vk_cmd *cmd);
 
 // Predefined structs for a simple non-layered, non-mipped image
 extern const VkImageSubresourceRange vk_range;
diff --git a/video/out/w32_common.c b/video/out/w32_common.c
index feeae81..de81b35 100644
--- a/video/out/w32_common.c
+++ b/video/out/w32_common.c
@@ -79,6 +79,7 @@ struct vo_w32_state {
     pthread_t thread;
     bool terminate;
     struct mp_dispatch_queue *dispatch; // used to run stuff on the GUI thread
+    bool in_dispatch;
 
     struct w32_api api; // stores functions from dynamically loaded DLLs
 
@@ -920,7 +921,11 @@ static LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam,
 
     // The dispatch queue should be processed as soon as possible to prevent
     // playback glitches, since it is likely blocking the VO thread
-    mp_dispatch_queue_process(w32->dispatch, 0);
+    if (!w32->in_dispatch) {
+        w32->in_dispatch = true;
+        mp_dispatch_queue_process(w32->dispatch, 0);
+        w32->in_dispatch = false;
+    }
 
     switch (message) {
     case WM_ERASEBKGND: // no need to erase background separately
diff --git a/video/out/wayland_common.c b/video/out/wayland_common.c
index 19adf01..0ed1468 100644
--- a/video/out/wayland_common.c
+++ b/video/out/wayland_common.c
@@ -26,8 +26,8 @@
 #include "win_state.h"
 #include "wayland_common.h"
 
-// Generated from xdg-shell-unstable-v6.xml
-#include "video/out/wayland/xdg-shell-v6.h"
+// Generated from xdg-shell.xml
+#include "video/out/wayland/xdg-shell.h"
 
 // Generated from idle-inhibit-unstable-v1.xml
 #include "video/out/wayland/idle-inhibit-v1.h"
@@ -35,12 +35,12 @@
 // Generated from server-decoration.xml
 #include "video/out/wayland/srv-decor.h"
 
-static void xdg_shell_ping(void *data, struct zxdg_shell_v6 *shell, uint32_t serial)
+static void xdg_shell_ping(void *data, struct xdg_wm_base *shell, uint32_t serial)
 {
-    zxdg_shell_v6_pong(shell, serial);
+    xdg_wm_base_pong(shell, serial);
 }
 
-static const struct zxdg_shell_v6_listener xdg_shell_listener = {
+static const struct xdg_wm_base_listener xdg_shell_listener = {
     xdg_shell_ping,
 };
 
@@ -125,7 +125,7 @@ static void pointer_handle_motion(void *data, struct wl_pointer *pointer,
 static void window_move(struct vo_wayland_state *wl, uint32_t serial)
 {
     if (wl->xdg_toplevel)
-        zxdg_toplevel_v6_move(wl->xdg_toplevel, wl->seat, serial);
+        xdg_toplevel_move(wl->xdg_toplevel, wl->seat, serial);
 }
 
 static void pointer_handle_button(void *data, struct wl_pointer *wl_pointer,
@@ -177,9 +177,9 @@ static const struct wl_pointer_listener pointer_listener = {
 };
 
 static int check_for_resize(struct vo_wayland_state *wl, wl_fixed_t x_w, wl_fixed_t y_w,
-                            enum zxdg_toplevel_v6_resize_edge *edge)
+                            enum xdg_toplevel_resize_edge *edge)
 {
-    if (wl->touch_entries || wl->fullscreen)
+    if (wl->touch_entries || wl->fullscreen || wl->maximized)
         return 0;
 
     const int edge_pixels = 64;
@@ -190,21 +190,21 @@ static int check_for_resize(struct vo_wayland_state *wl, wl_fixed_t x_w, wl_fixe
     int bottom_edge = pos[1] > (mp_rect_h(wl->geometry) - edge_pixels);
 
     if (left_edge) {
-        *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_LEFT;
+        *edge = XDG_TOPLEVEL_RESIZE_EDGE_LEFT;
         if (top_edge)
-            *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_TOP_LEFT;
+            *edge = XDG_TOPLEVEL_RESIZE_EDGE_TOP_LEFT;
         else if (bottom_edge)
-            *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_BOTTOM_LEFT;
+            *edge = XDG_TOPLEVEL_RESIZE_EDGE_BOTTOM_LEFT;
     } else if (right_edge) {
-        *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_RIGHT;
+        *edge = XDG_TOPLEVEL_RESIZE_EDGE_RIGHT;
         if (top_edge)
-            *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_TOP_RIGHT;
+            *edge = XDG_TOPLEVEL_RESIZE_EDGE_TOP_RIGHT;
         else if (bottom_edge)
-            *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_BOTTOM_RIGHT;
+            *edge = XDG_TOPLEVEL_RESIZE_EDGE_BOTTOM_RIGHT;
     } else if (top_edge) {
-        *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_TOP;
+        *edge = XDG_TOPLEVEL_RESIZE_EDGE_TOP;
     } else if (bottom_edge) {
-        *edge = ZXDG_TOPLEVEL_V6_RESIZE_EDGE_BOTTOM;
+        *edge = XDG_TOPLEVEL_RESIZE_EDGE_BOTTOM;
     } else {
         *edge = 0;
         return 0;
@@ -219,14 +219,14 @@ static void touch_handle_down(void *data, struct wl_touch *wl_touch,
 {
     struct vo_wayland_state *wl = data;
 
-    enum zxdg_toplevel_v6_resize_edge edge;
+    enum xdg_toplevel_resize_edge edge;
     if (check_for_resize(wl, x_w, y_w, &edge)) {
         wl->touch_entries = 0;
-        zxdg_toplevel_v6_resize(wl->xdg_toplevel, wl->seat, serial, edge);
+        xdg_toplevel_resize(wl->xdg_toplevel, wl->seat, serial, edge);
         return;
     } else if (wl->touch_entries) {
         wl->touch_entries = 0;
-        zxdg_toplevel_v6_move(wl->xdg_toplevel, wl->seat, serial);
+        xdg_toplevel_move(wl->xdg_toplevel, wl->seat, serial);
         return;
     }
 
@@ -782,9 +782,8 @@ static void registry_handle_add(void *data, struct wl_registry *reg, uint32_t id
     int found = 1;
     struct vo_wayland_state *wl = data;
 
-    if (!strcmp(interface, wl_compositor_interface.name) && found++) {
-        ver = MPMIN(ver, 4); /* Cap the version */
-        wl->compositor = wl_registry_bind(reg, id, &wl_compositor_interface, ver);
+    if (!strcmp(interface, wl_compositor_interface.name) && (ver >= 3) && found++) {
+        wl->compositor = wl_registry_bind(reg, id, &wl_compositor_interface, 3);
         wl->surface = wl_compositor_create_surface(wl->compositor);
         wl->cursor_surface = wl_compositor_create_surface(wl->compositor);
         wl_surface_add_listener(wl->surface, &surface_listener, wl);
@@ -805,9 +804,10 @@ static void registry_handle_add(void *data, struct wl_registry *reg, uint32_t id
         wl_list_insert(&wl->output_list, &output->link);
     }
 
-    if (!strcmp(interface, zxdg_shell_v6_interface.name) && found++) {
-        wl->shell = wl_registry_bind(reg, id, &zxdg_shell_v6_interface, 1);
-        zxdg_shell_v6_add_listener(wl->shell, &xdg_shell_listener, wl);
+    if (!strcmp(interface, xdg_wm_base_interface.name) && found++) {
+        ver = MPMIN(ver, 2); /* We can use either 1 or 2 */
+        wl->shell = wl_registry_bind(reg, id, &xdg_wm_base_interface, ver);
+        xdg_wm_base_add_listener(wl->shell, &xdg_shell_listener, wl);
     }
 
     if (!strcmp(interface, wl_seat_interface.name) && found++) {
@@ -866,38 +866,42 @@ static const struct wl_registry_listener registry_listener = {
     registry_handle_remove,
 };
 
-static void handle_surface_config(void *data, struct zxdg_surface_v6 *surface,
+static void handle_surface_config(void *data, struct xdg_surface *surface,
                                   uint32_t serial)
 {
-    zxdg_surface_v6_ack_configure(surface, serial);
+    xdg_surface_ack_configure(surface, serial);
 }
 
-static const struct zxdg_surface_v6_listener xdg_surface_listener = {
+static const struct xdg_surface_listener xdg_surface_listener = {
     handle_surface_config,
 };
 
-static void handle_toplevel_config(void *data, struct zxdg_toplevel_v6 *toplevel,
+static void handle_toplevel_config(void *data, struct xdg_toplevel *toplevel,
                                    int32_t width, int32_t height, struct wl_array *states)
 {
     struct vo_wayland_state *wl = data;
     struct mp_rect old_geometry = wl->geometry;
 
     int prev_fs_state = wl->fullscreen;
-    bool maximized = false;
+    wl->maximized = false;
     wl->fullscreen = false;
-    enum zxdg_toplevel_v6_state *state;
+    enum xdg_toplevel_state *state;
     wl_array_for_each(state, states) {
         switch (*state) {
-        case ZXDG_TOPLEVEL_V6_STATE_FULLSCREEN:
+        case XDG_TOPLEVEL_STATE_FULLSCREEN:
             wl->fullscreen = true;
             break;
-        case ZXDG_TOPLEVEL_V6_STATE_RESIZING:
+        case XDG_TOPLEVEL_STATE_RESIZING:
             wl->pending_vo_events |= VO_EVENT_LIVE_RESIZING;
             break;
-        case ZXDG_TOPLEVEL_V6_STATE_MAXIMIZED:
-            maximized = true;
+        case XDG_TOPLEVEL_STATE_ACTIVATED:
             break;
-        case ZXDG_TOPLEVEL_V6_STATE_ACTIVATED:
+        case XDG_TOPLEVEL_STATE_TILED_TOP:
+        case XDG_TOPLEVEL_STATE_TILED_LEFT:
+        case XDG_TOPLEVEL_STATE_TILED_RIGHT:
+        case XDG_TOPLEVEL_STATE_TILED_BOTTOM:
+        case XDG_TOPLEVEL_STATE_MAXIMIZED:
+            wl->maximized = true;
             break;
         }
     }
@@ -910,7 +914,7 @@ static void handle_toplevel_config(void *data, struct zxdg_toplevel_v6 *toplevel
     if (width > 0 && height > 0) {
         if (!wl->fullscreen) {
             if (wl->vo->opts->keepaspect && wl->vo->opts->keepaspect_window &&
-                !maximized) {
+                !wl->maximized) {
                 if (width > height)
                     width  = height * wl->aspect_ratio;
                 else
@@ -939,27 +943,27 @@ static void handle_toplevel_config(void *data, struct zxdg_toplevel_v6 *toplevel
     wl->pending_vo_events |= VO_EVENT_RESIZE;
 }
 
-static void handle_toplevel_close(void *data, struct zxdg_toplevel_v6 *xdg_toplevel)
+static void handle_toplevel_close(void *data, struct xdg_toplevel *xdg_toplevel)
 {
     struct vo_wayland_state *wl = data;
     mp_input_put_key(wl->vo->input_ctx, MP_KEY_CLOSE_WIN);
 }
 
-static const struct zxdg_toplevel_v6_listener xdg_toplevel_listener = {
+static const struct xdg_toplevel_listener xdg_toplevel_listener = {
     handle_toplevel_config,
     handle_toplevel_close,
 };
 
 static int create_xdg_surface(struct vo_wayland_state *wl)
 {
-    wl->xdg_surface = zxdg_shell_v6_get_xdg_surface(wl->shell, wl->surface);
-    zxdg_surface_v6_add_listener(wl->xdg_surface, &xdg_surface_listener, wl);
+    wl->xdg_surface = xdg_wm_base_get_xdg_surface(wl->shell, wl->surface);
+    xdg_surface_add_listener(wl->xdg_surface, &xdg_surface_listener, wl);
 
-    wl->xdg_toplevel = zxdg_surface_v6_get_toplevel(wl->xdg_surface);
-    zxdg_toplevel_v6_add_listener(wl->xdg_toplevel, &xdg_toplevel_listener, wl);
+    wl->xdg_toplevel = xdg_surface_get_toplevel(wl->xdg_surface);
+    xdg_toplevel_add_listener(wl->xdg_toplevel, &xdg_toplevel_listener, wl);
 
-    zxdg_toplevel_v6_set_title (wl->xdg_toplevel, "mpv");
-    zxdg_toplevel_v6_set_app_id(wl->xdg_toplevel, "mpv");
+    xdg_toplevel_set_title (wl->xdg_toplevel, "mpv");
+    xdg_toplevel_set_app_id(wl->xdg_toplevel, "mpv");
 
     return 0;
 }
@@ -1010,7 +1014,7 @@ int vo_wayland_init(struct vo *vo)
 
     if (!wl->shell) {
         MP_FATAL(wl, "Compositor doesn't support the required %s protocol!\n",
-                 zxdg_shell_v6_interface.name);
+                 xdg_wm_base_interface.name);
         return false;
     }
 
@@ -1074,7 +1078,7 @@ void vo_wayland_uninit(struct vo *vo)
         zwp_idle_inhibit_manager_v1_destroy(wl->idle_inhibit_manager);
 
     if (wl->shell)
-        zxdg_shell_v6_destroy(wl->shell);
+        xdg_wm_base_destroy(wl->shell);
 
     if (wl->shm)
         wl_shm_destroy(wl->shm);
@@ -1153,11 +1157,14 @@ int vo_wayland_reconfig(struct vo *vo)
     vo_calc_window_geometry(vo, &screenrc, &geo);
     vo_apply_window_geometry(vo, &geo);
 
-    wl->geometry.x0  = 0;
-    wl->geometry.y0  = 0;
-    wl->geometry.x1  = vo->dwidth / wl->scaling;
-    wl->geometry.y1  = vo->dheight / wl->scaling;
-    wl->window_size  = wl->geometry;
+    if (!wl->configured || !wl->maximized) {
+        wl->geometry.x0 = 0;
+        wl->geometry.y0 = 0;
+        wl->geometry.x1 = vo->dwidth  / wl->scaling;
+        wl->geometry.y1 = vo->dheight / wl->scaling;
+        wl->window_size = wl->geometry;
+    }
+
     wl->aspect_ratio = vo->dwidth / (float)vo->dheight;
 
     if (vo->opts->fullscreen) {
@@ -1168,7 +1175,7 @@ int vo_wayland_reconfig(struct vo *vo)
             wl->geometry.x1  = mp_rect_w(wl->current_output->geometry)/wl->scaling;
             wl->geometry.y1  = mp_rect_h(wl->current_output->geometry)/wl->scaling;
         } else {
-            zxdg_toplevel_v6_set_fullscreen(wl->xdg_toplevel, wl_out);
+            xdg_toplevel_set_fullscreen(wl->xdg_toplevel, wl_out);
         }
     }
 
@@ -1198,6 +1205,7 @@ static int set_screensaver_inhibitor(struct vo_wayland_state *wl, int state)
     } else {
         MP_VERBOSE(wl, "Disabling the idle inhibitor\n");
         zwp_idle_inhibitor_v1_destroy(wl->idle_inhibitor);
+        wl->idle_inhibitor = NULL;
     }
     return VO_TRUE;
 }
@@ -1207,9 +1215,9 @@ static int toggle_fullscreen(struct vo_wayland_state *wl)
     if (!wl->xdg_toplevel)
         return VO_NOTAVAIL;
     if (wl->fullscreen)
-        zxdg_toplevel_v6_unset_fullscreen(wl->xdg_toplevel);
+        xdg_toplevel_unset_fullscreen(wl->xdg_toplevel);
     else
-        zxdg_toplevel_v6_set_fullscreen(wl->xdg_toplevel, NULL);
+        xdg_toplevel_set_fullscreen(wl->xdg_toplevel, NULL);
     return VO_TRUE;
 }
 
@@ -1217,7 +1225,7 @@ static int update_window_title(struct vo_wayland_state *wl, char *title)
 {
     if (!wl->xdg_toplevel)
         return VO_NOTAVAIL;
-    zxdg_toplevel_v6_set_title(wl->xdg_toplevel, title);
+    xdg_toplevel_set_title(wl->xdg_toplevel, title);
     return VO_TRUE;
 }
 
@@ -1318,7 +1326,7 @@ int vo_wayland_control(struct vo *vo, int *events, int request, void *arg)
     }
     case VOCTRL_SET_UNFS_WINDOW_SIZE: {
         int *s = arg;
-        if (!wl->fullscreen) {
+        if (!wl->fullscreen && !wl->maximized) {
             wl->geometry.x0 = 0;
             wl->geometry.y0 = 0;
             wl->geometry.x1 = s[0]/wl->scaling;
diff --git a/video/out/wayland_common.h b/video/out/wayland_common.h
index 4911009..9aa057f 100644
--- a/video/out/wayland_common.h
+++ b/video/out/wayland_common.h
@@ -54,6 +54,7 @@ struct vo_wayland_state {
     struct mp_rect window_size;
     float aspect_ratio;
     bool fullscreen;
+    bool maximized;
     bool configured;
     int wakeup_pipe[2];
     int pending_vo_events;
@@ -69,9 +70,9 @@ struct vo_wayland_state {
 
     /* Shell */
     struct wl_surface       *surface;
-    struct zxdg_shell_v6    *shell;
-    struct zxdg_toplevel_v6 *xdg_toplevel;
-    struct zxdg_surface_v6  *xdg_surface;
+    struct xdg_wm_base      *shell;
+    struct xdg_toplevel     *xdg_toplevel;
+    struct xdg_surface      *xdg_surface;
     struct org_kde_kwin_server_decoration_manager *server_decoration_manager;
     struct org_kde_kwin_server_decoration *server_decoration;
     struct zwp_idle_inhibit_manager_v1 *idle_inhibit_manager;
diff --git a/video/out/win_state.c b/video/out/win_state.c
index d6c8788..f9fb0ca 100644
--- a/video/out/win_state.c
+++ b/video/out/win_state.c
@@ -84,8 +84,8 @@ void vo_calc_window_geometry2(struct vo *vo, const struct mp_rect *screen,
     *out_geo = (struct vo_win_geometry){0};
 
     // The case of calling this function even though no video was configured
-    // yet (i.e. vo->params==NULL) happens when vo_opengl creates a hidden
-    // window in order to create an OpenGL context.
+    // yet (i.e. vo->params==NULL) happens when vo_gpu creates a hidden window
+    // in order to create a rendering context.
     struct mp_image_params params = { .w = 320, .h = 200 };
     if (vo->params)
         params = *vo->params;
diff --git a/video/out/x11_common.c b/video/out/x11_common.c
index 5f2c658..25325e7 100644
--- a/video/out/x11_common.c
+++ b/video/out/x11_common.c
@@ -630,7 +630,7 @@ static const struct mp_keymap keymap[] = {
     {XK_Pause, MP_KEY_PAUSE}, {XK_Escape, MP_KEY_ESC},
     {XK_BackSpace, MP_KEY_BS}, {XK_Tab, MP_KEY_TAB}, {XK_Return, MP_KEY_ENTER},
     {XK_Menu, MP_KEY_MENU}, {XK_Print, MP_KEY_PRINT},
-    {XK_Cancel, MP_KEY_CANCEL},
+    {XK_Cancel, MP_KEY_CANCEL}, {XK_ISO_Left_Tab, MP_KEY_TAB},
 
     // cursor keys
     {XK_Left, MP_KEY_LEFT}, {XK_Right, MP_KEY_RIGHT}, {XK_Up, MP_KEY_UP},
@@ -1133,6 +1133,8 @@ void vo_x11_check_events(struct vo *vo)
             mp_input_put_key(x11->input_ctx, MP_KEY_MOUSE_ENTER);
             break;
         case ButtonPress:
+            if (Event.xbutton.button - 1 >= MP_KEY_MOUSE_BTN_COUNT)
+                break;
             if (Event.xbutton.button == 1)
                 x11->win_drag_button1_down = true;
             mp_input_put_key(x11->input_ctx,
@@ -1142,6 +1144,8 @@ void vo_x11_check_events(struct vo *vo)
             vo_x11_xembed_send_message(x11, msg);
             break;
         case ButtonRelease:
+            if (Event.xbutton.button - 1 >= MP_KEY_MOUSE_BTN_COUNT)
+                break;
             if (Event.xbutton.button == 1)
                 x11->win_drag_button1_down = false;
             mp_input_put_key(x11->input_ctx,
author	James Cowgill <jcowgill@debian.org>	2018-07-27 14:24:34 +0800
committer	James Cowgill <jcowgill@debian.org>	2018-07-27 14:24:34 +0800
commit	f4faf74f8747c113bd8c1f99e6b6fb1983f11e0d (patch)
tree	a9888a5b34d33fa31cc656c856d81333aa0e3ab3 /video/out
parent	d96cb5fac5258f82733a6e26aa212939f2ce991d (diff)