summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoey Hess <joeyh@joeyh.name>2019-11-14 11:31:43 -0400
committerJoey Hess <joeyh@joeyh.name>2019-11-14 13:51:09 -0400
commit667d38a8f11c1ee8f256cdbd80e225c2bae06595 (patch)
tree5c8e902aad7639b5393e5073d53f81d7c75d9240
parent20d9a9b662dbf35a670cdce534d8784a2e0341e7 (diff)
Fix a crash (STM deadlock) when -J is used with multiple files that point to the same key
See the comment for a trace of the deadlock. Added a new StartStage. New worker threads begin in the StartStage. Once a thread is ready to do work, it moves away from the StartStage, and no thread will ever transition back to it. A thread that blocks waiting on another thread that is processing the same key will block while in the StartStage. That other thread will never switch back to the StartStage, and so the deadlock is avoided.
-rw-r--r--Annex/Concurrent.hs40
-rw-r--r--CHANGELOG2
-rw-r--r--CmdLine/Action.hs11
-rw-r--r--Types/WorkerPool.hs14
-rw-r--r--doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction.mdwn2
-rw-r--r--doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction/comment_6_e33df0ec76069deb069b94c944d62c76._comment69
6 files changed, 114 insertions, 24 deletions
diff --git a/Annex/Concurrent.hs b/Annex/Concurrent.hs
index 4626a9294f..1ff8e0c730 100644
--- a/Annex/Concurrent.hs
+++ b/Annex/Concurrent.hs
@@ -90,10 +90,20 @@ enteringStage newstage a = Annex.getState Annex.workers >>= \case
Nothing -> a
Just tv -> do
mytid <- liftIO myThreadId
- let set = changeStageTo mytid tv newstage
- let restore = maybe noop (void . changeStageTo mytid tv)
+ let set = changeStageTo mytid tv (const newstage)
+ let restore = maybe noop (void . changeStageTo mytid tv . const)
bracket set restore (const a)
+{- Transition the current thread to the initial stage.
+ - This is done once the thread is ready to begin work.
+ -}
+enteringInitialStage :: Annex ()
+enteringInitialStage = Annex.getState Annex.workers >>= \case
+ Nothing -> noop
+ Just tv -> do
+ mytid <- liftIO myThreadId
+ void $ changeStageTo mytid tv initialStage
+
{- This needs to leave the WorkerPool with the same number of
- idle and active threads, and with the same number of threads for each
- WorkerStage. So, all it can do is swap the WorkerStage of our thread's
@@ -110,14 +120,15 @@ enteringStage newstage a = Annex.getState Annex.workers >>= \case
- in the pool than spareVals. That does not prevent other threads that call
- this from using them though, so it's fine.
-}
-changeStageTo :: ThreadId -> TMVar (WorkerPool AnnexState) -> WorkerStage -> Annex (Maybe WorkerStage)
-changeStageTo mytid tv newstage = liftIO $
+changeStageTo :: ThreadId -> TMVar (WorkerPool AnnexState) -> (UsedStages -> WorkerStage) -> Annex (Maybe WorkerStage)
+changeStageTo mytid tv getnewstage = liftIO $
replaceidle >>= maybe
(return Nothing)
(either waitidle (return . Just))
where
replaceidle = atomically $ do
pool <- takeTMVar tv
+ let newstage = getnewstage (usedStages pool)
let notchanging = do
putTMVar tv pool
return Nothing
@@ -128,7 +139,7 @@ changeStageTo mytid tv newstage = liftIO $
Nothing -> do
putTMVar tv $
addWorkerPool (IdleWorker oldstage) pool'
- return $ Just $ Left (myaid, oldstage)
+ return $ Just $ Left (myaid, newstage, oldstage)
Just pool'' -> do
-- optimisation
putTMVar tv $
@@ -139,27 +150,26 @@ changeStageTo mytid tv newstage = liftIO $
_ -> notchanging
else notchanging
- waitidle (myaid, oldstage) = atomically $ do
+ waitidle (myaid, newstage, oldstage) = atomically $ do
pool <- waitIdleWorkerSlot newstage =<< takeTMVar tv
putTMVar tv $ addWorkerPool (ActiveWorker myaid newstage) pool
return (Just oldstage)
--- | Waits until there's an idle worker in the worker pool
--- for its initial stage, removes it from the pool, and returns its state.
+-- | Waits until there's an idle StartStage worker in the worker pool,
+-- removes it from the pool, and returns its state.
--
-- If the worker pool is not already allocated, returns Nothing.
-waitInitialWorkerSlot :: TMVar (WorkerPool Annex.AnnexState) -> STM (Maybe (Annex.AnnexState, WorkerStage))
-waitInitialWorkerSlot tv = do
+waitStartWorkerSlot :: TMVar (WorkerPool Annex.AnnexState) -> STM (Maybe (Annex.AnnexState, WorkerStage))
+waitStartWorkerSlot tv = do
pool <- takeTMVar tv
- let stage = initialStage (usedStages pool)
- st <- go stage pool
- return $ Just (st, stage)
+ st <- go pool
+ return $ Just (st, StartStage)
where
- go wantstage pool = case spareVals pool of
+ go pool = case spareVals pool of
[] -> retry
(v:vs) -> do
let pool' = pool { spareVals = vs }
- putTMVar tv =<< waitIdleWorkerSlot wantstage pool'
+ putTMVar tv =<< waitIdleWorkerSlot StartStage pool'
return v
waitIdleWorkerSlot :: WorkerStage -> WorkerPool Annex.AnnexState -> STM (WorkerPool Annex.AnnexState)
diff --git a/CHANGELOG b/CHANGELOG
index 03d8e7b118..c6a628a6e0 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -3,6 +3,8 @@ git-annex (7.20191107) UNRELEASED; urgency=medium
* Added annex.allowsign option.
* Make --json-error-messages capture more errors,
particularly url download errors.
+ * Fix a crash (STM deadlock) when -J is used with multiple files
+ that point to the same key.
-- Joey Hess <id@joeyh.name> Mon, 11 Nov 2019 15:59:47 -0400
diff --git a/CmdLine/Action.hs b/CmdLine/Action.hs
index 87298a95ff..67a7618e4c 100644
--- a/CmdLine/Action.hs
+++ b/CmdLine/Action.hs
@@ -63,7 +63,7 @@ commandAction start = Annex.getState Annex.concurrency >>= \case
runconcurrent = Annex.getState Annex.workers >>= \case
Nothing -> runnonconcurrent
Just tv ->
- liftIO (atomically (waitInitialWorkerSlot tv)) >>=
+ liftIO (atomically (waitStartWorkerSlot tv)) >>=
maybe runnonconcurrent (runconcurrent' tv)
runconcurrent' tv (workerst, workerstage) = do
aid <- liftIO $ async $ snd <$> Annex.run workerst
@@ -99,12 +99,13 @@ commandAction start = Annex.getState Annex.concurrency >>= \case
case mkActionItem startmsg' of
OnlyActionOn k' _ | k' /= k ->
concurrentjob' workerst startmsg' perform'
- _ -> mkjob workerst startmsg' perform'
+ _ -> beginjob workerst startmsg' perform'
Nothing -> noop
- _ -> mkjob workerst startmsg perform
+ _ -> beginjob workerst startmsg perform
- mkjob workerst startmsg perform =
- inOwnConsoleRegion (Annex.output workerst) $
+ beginjob workerst startmsg perform =
+ inOwnConsoleRegion (Annex.output workerst) $ do
+ enteringInitialStage
void $ accountCommandAction startmsg $
performconcurrent startmsg perform
diff --git a/Types/WorkerPool.hs b/Types/WorkerPool.hs
index 178c30166c..8a68163138 100644
--- a/Types/WorkerPool.hs
+++ b/Types/WorkerPool.hs
@@ -40,7 +40,12 @@ instance Show (Worker t) where
show (ActiveWorker _ s) = "ActiveWorker " ++ show s
data WorkerStage
- = PerformStage
+ = StartStage
+ -- ^ All threads start in this stage, and then transition away from
+ -- it to the initialStage when they begin doing work. This should
+ -- never be included in UsedStages, because transition from some
+ -- other stage back to this one could result in a deadlock.
+ | PerformStage
-- ^ Running a CommandPerform action.
| CleanupStage
-- ^ Running a CommandCleanup action.
@@ -102,12 +107,13 @@ workerAsync (ActiveWorker aid _) = Just aid
allocateWorkerPool :: t -> Int -> UsedStages -> WorkerPool t
allocateWorkerPool t n u = WorkerPool
{ usedStages = u
- , workerList = take totalthreads $ map IdleWorker stages
+ , workerList = map IdleWorker $
+ take totalthreads $ concat $ repeat stages
, spareVals = replicate totalthreads t
}
where
- stages = concat $ repeat $ S.toList $ stageSet u
- totalthreads = n * S.size (stageSet u)
+ stages = StartStage : S.toList (stageSet u)
+ totalthreads = n * length stages
addWorkerPool :: Worker t -> WorkerPool t -> WorkerPool t
addWorkerPool w pool = pool { workerList = w : workerList pool }
diff --git a/doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction.mdwn b/doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction.mdwn
index 837cfecb1c..1aa191ed88 100644
--- a/doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction.mdwn
+++ b/doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction.mdwn
@@ -69,3 +69,5 @@ I felt like it is an old issue but failed to find a trace of it upon a quick loo
[[!meta author=yoh]]
[[!tag projects/datalad]]
+
+> [[fixed|done]] --[[Joey]]
diff --git a/doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction/comment_6_e33df0ec76069deb069b94c944d62c76._comment b/doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction/comment_6_e33df0ec76069deb069b94c944d62c76._comment
new file mode 100644
index 0000000000..7f348fc9f5
--- /dev/null
+++ b/doc/bugs/parallel_get_to_the_files_for_the_same_key_would_fail_with__thread_blocked_indefinitely_in_an_STM_transaction/comment_6_e33df0ec76069deb069b94c944d62c76._comment
@@ -0,0 +1,69 @@
+[[!comment format=mdwn
+ username="joey"
+ subject="""comment 6"""
+ date="2019-11-14T15:20:13Z"
+ content="""
+Added tracing of changes to the WorkerPool.
+
+ joey@darkstar:/tmp/dst>git annex get -J1 1 2 --json
+ ("initial pool",WorkerPool UsedStages {initialStage = TransferStage, stageSet = fromList [TransferStage,VerifyStage]} [IdleWorker TransferStage,IdleWorker VerifyStage] 2)
+ ("starting worker",WorkerPool UsedStages {initialStage = TransferStage, stageSet = fromList [TransferStage,VerifyStage]} [ActiveWorker TransferStage,IdleWorker VerifyStage] 1)
+
+Transfer starts for file 1
+
+ (("change stage from",TransferStage,"to",VerifyStage),WorkerPool UsedStages {initialStage = TransferStage, stageSet = fromList [TransferStage,VerifyStage]} [IdleWorker TransferStage,ActiveWorker VerifyStage] 1)
+
+Transfer complete, verifying starts.
+
+ ("starting worker",WorkerPool UsedStages {initialStage = TransferStage, stageSet = fromList [TransferStage,VerifyStage]} [ActiveWorker TransferStage,ActiveWorker VerifyStage] 0)
+
+This second thread is being started to process file 2.
+It starts in TransferStage, but it will be blocked from doing anything
+by ensureOnlyActionOn.
+
+ ("finishCommandActions starts with",WorkerPool UsedStages {initialStage = TransferStage, stageSet = fromList [TransferStage,VerifyStage]} [ActiveWorker TransferStage,ActiveWorker VerifyStage] 0)
+ ("finishCommandActions observes",WorkerPool UsedStages {initialStage = TransferStage, stageSet = fromList [TransferStage,VerifyStage]} [ActiveWorker TransferStage,ActiveWorker VerifyStage] 0)
+
+All files have threads to process them started, so finishCommandActions starts up.
+It will retry since the threads are still running.
+
+ (("change stage from",VerifyStage,"to",TransferStage),WorkerPool UsedStages {initialStage = TransferStage, stageSet = fromList [TransferStage,VerifyStage]} [IdleWorker VerifyStage,ActiveWorker TransferStage] 0)
+
+The first thread is done with verification, and
+the stage is being restored to transfer.
+
+The 0 means that there are 0 spareVals. Normally, the number of spareVals
+should be the same as the number of IdleWorkers, so it should be 1.
+It's 0 because the thread is in the process of changing between stages.
+
+The thread should at this point be waiting for an idle TransferStage
+slot to become available. The second thread still has that active.
+It seems that wait never completes, because a trace I had after that wait
+never got printed.
+
+ ("finishCommandActions observes",WorkerPool UsedStages {initialStage = TransferStage, stageSet = fromList [TransferStage,VerifyStage]} [IdleWorker VerifyStage,ActiveWorker TransferStage] 0)
+
+It retries again, because of the active worker and also because spareVals
+is not the same as IdleWorkers.
+
+ git-annex: thread blocked indefinitely in an STM transaction
+
+Deadlock.
+
+Looks like that second thread that got into transfer stage
+never leaves it, and then the first thread, which wants to
+restore back to transfer stage, is left waiting forever for it. And so is
+finishCommandActions.
+
+Aha! The second thread is in fact still in ensureOnlyActionOn.
+So it's waiting on the first thread to finish. But the first thread can't
+transition back to TransferStage because the second thread has stolen it.
+
+Now it makes sense.
+
+So.. One way to fix this would be to add a new stage, which is used for
+threads that are just starting. Then the second thread would be in
+StartStage, and the first thread would not be prevented from transitioning
+back to TransferStage. Would need to make sure that, once a thread leaves
+StartStage, it does not ever transition back to it.
+"""]]