Diffstat (limited to 'cmd/restic/lock.go')
-rw-r--r-- | cmd/restic/lock.go | 152
1 file changed, 134 insertions(+), 18 deletions(-)
diff --git a/cmd/restic/lock.go b/cmd/restic/lock.go
index f39a08db6..11c1ed8f5 100644
--- a/cmd/restic/lock.go
+++ b/cmd/restic/lock.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"context"
+	"fmt"
 	"sync"
 	"time"
 
@@ -11,6 +12,7 @@ import (
 )
 
 type lockContext struct {
+	lock      *restic.Lock
 	cancel    context.CancelFunc
 	refreshWG sync.WaitGroup
 }
@@ -21,17 +23,29 @@ var globalLocks struct {
 	sync.Once
 }
 
-func lockRepo(ctx context.Context, repo restic.Repository) (*restic.Lock, context.Context, error) {
-	return lockRepository(ctx, repo, false)
+func lockRepo(ctx context.Context, repo restic.Repository, retryLock time.Duration, json bool) (*restic.Lock, context.Context, error) {
+	return lockRepository(ctx, repo, false, retryLock, json)
 }
 
-func lockRepoExclusive(ctx context.Context, repo restic.Repository) (*restic.Lock, context.Context, error) {
-	return lockRepository(ctx, repo, true)
+func lockRepoExclusive(ctx context.Context, repo restic.Repository, retryLock time.Duration, json bool) (*restic.Lock, context.Context, error) {
+	return lockRepository(ctx, repo, true, retryLock, json)
+}
+
+var (
+	retrySleepStart = 5 * time.Second
+	retrySleepMax   = 60 * time.Second
+)
+
+func minDuration(a, b time.Duration) time.Duration {
+	if a <= b {
+		return a
+	}
+	return b
 }
 
 // lockRepository wraps the ctx such that it is cancelled when the repository is unlocked
 // cancelling the original context also stops the lock refresh
-func lockRepository(ctx context.Context, repo restic.Repository, exclusive bool) (*restic.Lock, context.Context, error) {
+func lockRepository(ctx context.Context, repo restic.Repository, exclusive bool, retryLock time.Duration, json bool) (*restic.Lock, context.Context, error) {
 	// make sure that a repository is unlocked properly and after cancel() was
 	// called by the cleanup handler in global.go
 	globalLocks.Do(func() {
@@ -43,26 +57,65 @@ func lockRepository(ctx context.Context, repo restic.Repository, exclusive bool)
 		lockFn = restic.NewExclusiveLock
 	}
 
-	lock, err := lockFn(ctx, repo)
+	var lock *restic.Lock
+	var err error
+
+	retrySleep := minDuration(retrySleepStart, retryLock)
+	retryMessagePrinted := false
+	retryTimeout := time.After(retryLock)
+
+retryLoop:
+	for {
+		lock, err = lockFn(ctx, repo)
+		if err != nil && restic.IsAlreadyLocked(err) {
+
+			if !retryMessagePrinted {
+				if !json {
+					Verbosef("repo already locked, waiting up to %s for the lock\n", retryLock)
+				}
+				retryMessagePrinted = true
+			}
+
+			debug.Log("repo already locked, retrying in %v", retrySleep)
+			retrySleepCh := time.After(retrySleep)
+
+			select {
+			case <-ctx.Done():
+				return nil, ctx, ctx.Err()
+			case <-retryTimeout:
+				debug.Log("repo already locked, timeout expired")
+				// Last lock attempt
+				lock, err = lockFn(ctx, repo)
+				break retryLoop
+			case <-retrySleepCh:
+				retrySleep = minDuration(retrySleep*2, retrySleepMax)
+			}
+		} else {
+			// anything else, either a successful lock or another error
+			break retryLoop
+		}
+	}
 	if restic.IsInvalidLock(err) {
 		return nil, ctx, errors.Fatalf("%v\n\nthe `unlock --remove-all` command can be used to remove invalid locks. Make sure that no other restic process is accessing the repository when running the command", err)
 	}
 	if err != nil {
-		return nil, ctx, errors.Fatalf("unable to create lock in backend: %v", err)
+		return nil, ctx, fmt.Errorf("unable to create lock in backend: %w", err)
 	}
 	debug.Log("create lock %p (exclusive %v)", lock, exclusive)
 
 	ctx, cancel := context.WithCancel(ctx)
 	lockInfo := &lockContext{
+		lock:   lock,
 		cancel: cancel,
 	}
 	lockInfo.refreshWG.Add(2)
 	refreshChan := make(chan struct{})
+	forceRefreshChan := make(chan refreshLockRequest)
 
 	globalLocks.Lock()
 	globalLocks.locks[lock] = lockInfo
-	go refreshLocks(ctx, lock, lockInfo, refreshChan)
-	go monitorLockRefresh(ctx, lock, lockInfo, refreshChan)
+	go refreshLocks(ctx, repo.Backend(), lockInfo, refreshChan, forceRefreshChan)
+	go monitorLockRefresh(ctx, lockInfo, refreshChan, forceRefreshChan)
 	globalLocks.Unlock()
 
 	return lock, ctx, err
@@ -74,8 +127,13 @@ var refreshInterval = 5 * time.Minute
 // the difference allows to compensate for a small time drift between clients.
 var refreshabilityTimeout = restic.StaleLockTimeout - refreshInterval*3/2
 
-func refreshLocks(ctx context.Context, lock *restic.Lock, lockInfo *lockContext, refreshed chan<- struct{}) {
+type refreshLockRequest struct {
+	result chan bool
+}
+
+func refreshLocks(ctx context.Context, backend restic.Backend, lockInfo *lockContext, refreshed chan<- struct{}, forceRefresh <-chan refreshLockRequest) {
 	debug.Log("start")
+	lock := lockInfo.lock
 	ticker := time.NewTicker(refreshInterval)
 	lastRefresh := lock.Time
 
@@ -99,6 +157,22 @@ func refreshLocks(ctx context.Context, lock *restic.Lock, lockInfo *lockContext,
 		case <-ctx.Done():
 			debug.Log("terminate")
 			return
+
+		case req := <-forceRefresh:
+			debug.Log("trying to refresh stale lock")
+			// keep on going if our current lock still exists
+			success := tryRefreshStaleLock(ctx, backend, lock, lockInfo.cancel)
+			// inform refresh goroutine about forced refresh
+			select {
+			case <-ctx.Done():
+			case req.result <- success:
+			}
+
+			if success {
+				// update lock refresh time
+				lastRefresh = lock.Time
+			}
+
 		case <-ticker.C:
 			if time.Since(lastRefresh) > refreshabilityTimeout {
 				// the lock is too old, wait until the expiry monitor cancels the context
@@ -111,7 +185,7 @@ func refreshLocks(ctx context.Context, lock *restic.Lock, lockInfo *lockContext,
 				Warnf("unable to refresh lock: %v\n", err)
 			} else {
 				lastRefresh = lock.Time
-				// inform monitor gorountine about successful refresh
+				// inform monitor goroutine about successful refresh
 				select {
 				case <-ctx.Done():
 				case refreshed <- struct{}{}:
@@ -121,7 +195,7 @@ func refreshLocks(ctx context.Context, lock *restic.Lock, lockInfo *lockContext,
 			}
 		}
 	}
-func monitorLockRefresh(ctx context.Context, lock *restic.Lock, lockInfo *lockContext, refreshed <-chan struct{}) {
+func monitorLockRefresh(ctx context.Context, lockInfo *lockContext, refreshed <-chan struct{}, forceRefresh chan<- refreshLockRequest) {
 	// time.Now() might use a monotonic timer which is paused during standby
 	// convert to unix time to ensure we compare real time values
 	lastRefresh := time.Now().UnixNano()
@@ -133,24 +207,47 @@ func monitorLockRefresh(ctx context.Context, lock *restic.Lock, lockInfo *lockCo
 	// timers are paused during standby, which is a problem as the refresh timeout
 	// _must_ expire if the host was too long in standby. Thus fall back to periodic checks
 	// https://github.com/golang/go/issues/35012
-	timer := time.NewTimer(pollDuration)
+	ticker := time.NewTicker(pollDuration)
 	defer func() {
-		timer.Stop()
+		ticker.Stop()
 		lockInfo.cancel()
 		lockInfo.refreshWG.Done()
 	}()
 
+	var refreshStaleLockResult chan bool
+
 	for {
 		select {
 		case <-ctx.Done():
 			debug.Log("terminate expiry monitoring")
 			return
 		case <-refreshed:
+			if refreshStaleLockResult != nil {
+				// ignore delayed refresh notifications while the stale lock is refreshed
+				continue
+			}
 			lastRefresh = time.Now().UnixNano()
-		case <-timer.C:
-			if time.Now().UnixNano()-lastRefresh < refreshabilityTimeout.Nanoseconds() {
-				// restart timer
-				timer.Reset(pollDuration)
+		case <-ticker.C:
+			if time.Now().UnixNano()-lastRefresh < refreshabilityTimeout.Nanoseconds() || refreshStaleLockResult != nil {
+				continue
+			}
+
+			debug.Log("trying to refreshStaleLock")
+			// keep on going if our current lock still exists
+			refreshReq := refreshLockRequest{
+				result: make(chan bool),
+			}
+			refreshStaleLockResult = refreshReq.result
+
+			// inform refresh goroutine about forced refresh
+			select {
+			case <-ctx.Done():
+			case forceRefresh <- refreshReq:
+			}
+		case success := <-refreshStaleLockResult:
+			if success {
+				lastRefresh = time.Now().UnixNano()
+				refreshStaleLockResult = nil
 				continue
 			}
 
@@ -160,6 +257,25 @@ func monitorLockRefresh(ctx context.Context, lock *restic.Lock, lockInfo *lockCo
 	}
 }
 
+func tryRefreshStaleLock(ctx context.Context, backend restic.Backend, lock *restic.Lock, cancel context.CancelFunc) bool {
+	freeze := restic.AsBackend[restic.FreezeBackend](backend)
+	if freeze != nil {
+		debug.Log("freezing backend")
+		freeze.Freeze()
+		defer freeze.Unfreeze()
+	}
+
+	err := lock.RefreshStaleLock(ctx)
+	if err != nil {
+		Warnf("failed to refresh stale lock: %v\n", err)
+		// cancel context while the backend is still frozen to prevent accidental modifications
+		cancel()
+		return false
+	}
+
+	return true
+}
+
 func unlockRepo(lock *restic.Lock) {
 	if lock == nil {
 		return
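Note on the change above: the heart of it is the retryLoop added to lockRepository, a capped exponential backoff with an overall deadline. The standalone Go sketch below distills that pattern; it is not part of the commit, and tryLock and errAlreadyLocked are hypothetical stand-ins for restic's lockFn and the restic.IsAlreadyLocked check.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// errAlreadyLocked is a stand-in for the error detected by restic.IsAlreadyLocked.
var errAlreadyLocked = errors.New("repository is already locked")

// tryLock is a placeholder for restic's lockFn (restic.NewLock or
// restic.NewExclusiveLock); here it always fails so the retry path runs.
func tryLock(ctx context.Context) error {
	return errAlreadyLocked
}

// minDuration mirrors the helper added in the diff above.
func minDuration(a, b time.Duration) time.Duration {
	if a <= b {
		return a
	}
	return b
}

// lockWithRetry retries tryLock with a sleep that doubles after every failed
// attempt (capped at sleepMax) until the overall retryLock budget expires.
func lockWithRetry(ctx context.Context, retryLock time.Duration) error {
	const (
		sleepStart = 5 * time.Second
		sleepMax   = 60 * time.Second
	)
	sleep := minDuration(sleepStart, retryLock)
	// the deadline channel is created once, before the loop, so the
	// per-attempt sleeps can never stretch the total wait beyond retryLock
	deadline := time.After(retryLock)

	for {
		err := tryLock(ctx)
		if !errors.Is(err, errAlreadyLocked) {
			return err // success, or an error that retrying cannot fix
		}

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-deadline:
			return tryLock(ctx) // budget exhausted: one last attempt
		case <-time.After(sleep):
			sleep = minDuration(sleep*2, sleepMax) // exponential backoff, capped
		}
	}
}

func main() {
	err := lockWithRetry(context.Background(), 20*time.Second)
	fmt.Println("lock result:", err)
}

With retryLock set to 20 seconds, the sketch attempts the lock at roughly t=0s, t=5s, and t=15s, then makes one final attempt when the deadline fires at t=20s. This mirrors the retryLoop in lockRepository above, including the final lock attempt on timeout, which guarantees at least one retry even when retryLock is shorter than the first sleep.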