summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJoey Hess <joeyh@joeyh.name>2023-12-07 15:50:52 -0400
committerJoey Hess <joeyh@joeyh.name>2023-12-07 15:50:52 -0400
commitf1ce15036fbcebeb26e52e61eea44e457434f065 (patch)
tree6f0171fa8e342212c87878ae010610a69ada2d40
parentd06aee7ce0ccd60b51a2ede2694ea5e4330efc0d (diff)
started migrate --update
This is most of the way there, but not quite working. The layout of migrate.tree/ needs to be changed to follow this approach. git log will list all the files in tree order, so the new layout needs to alternate old and new keys. Can that be done? git may not document tree order, or may not preserve it here. Alternatively, change to using git log --format=raw and extract the tree header from that, then use git diff --raw $tree:migrate.tree/old $tree:migrate.tree/new That will be a little more expensive, but only when there are lots of migrations. Sponsored-by: Joshua Antonishen on Patreon
-rw-r--r--Annex/Branch.hs16
-rw-r--r--Annex/Locations.hs9
-rw-r--r--CHANGELOG2
-rw-r--r--Command/Log.hs9
-rw-r--r--Command/Migrate.hs23
-rw-r--r--Git/Log.hs40
-rw-r--r--Logs/Export.hs2
-rw-r--r--Logs/Migrate.hs95
-rw-r--r--doc/git-annex-migrate.mdwn14
9 files changed, 172 insertions, 38 deletions
diff --git a/Annex/Branch.hs b/Annex/Branch.hs
index f61accfc4f..cbd54f47a9 100644
--- a/Annex/Branch.hs
+++ b/Annex/Branch.hs
@@ -14,6 +14,7 @@ module Annex.Branch (
hasSibling,
siblingBranches,
create,
+ getBranch,
UpdateMade(..),
update,
forceUpdate,
@@ -120,7 +121,7 @@ siblingBranches = inRepo $ Git.Ref.matchingUniq [name]
create :: Annex ()
create = void getBranch
-{- Returns the ref of the branch, creating it first if necessary. -}
+{- Returns the sha of the branch, creating it first if necessary. -}
getBranch :: Annex Git.Ref
getBranch = maybe (hasOrigin >>= go >>= use) return =<< branchsha
where
@@ -920,10 +921,14 @@ getMergedRefs' = do
{- Grafts a treeish into the branch at the specified location,
- and then removes it. This ensures that the treeish won't get garbage
- collected, and will always be available as long as the git-annex branch
- - is available. -}
-rememberTreeish :: Git.Ref -> TopFilePath -> Annex ()
-rememberTreeish treeish graftpoint = lockJournal $ rememberTreeishLocked treeish graftpoint
-rememberTreeishLocked :: Git.Ref -> TopFilePath -> JournalLocked -> Annex ()
+ - is available.
+ -
+ - Returns the sha of the git commit made to the git-annex branch.
+ -}
+rememberTreeish :: Git.Ref -> TopFilePath -> Annex Git.Sha
+rememberTreeish treeish graftpoint = lockJournal $
+ rememberTreeishLocked treeish graftpoint
+rememberTreeishLocked :: Git.Ref -> TopFilePath -> JournalLocked -> Annex Git.Sha
rememberTreeishLocked treeish graftpoint jl = do
branchref <- getBranch
updateIndex jl branchref
@@ -940,6 +945,7 @@ rememberTreeishLocked treeish graftpoint jl = do
-- and the index was updated to that above, so it's safe to
-- say that the index contains c'.
setIndexSha c'
+ return c'
{- Runs an action on the content of selected files from the branch.
- This is much faster than reading the content of each file in turn,
diff --git a/Annex/Locations.hs b/Annex/Locations.hs
index bd94070423..6e4f0faa2a 100644
--- a/Annex/Locations.hs
+++ b/Annex/Locations.hs
@@ -56,6 +56,8 @@ module Annex.Locations (
gitAnnexAdjustedBranchUpdateLock,
gitAnnexMigrateLog,
gitAnnexMigrateLock,
+ gitAnnexMigrationsLog,
+ gitAnnexMigrationsLock,
gitAnnexMoveLog,
gitAnnexMoveLock,
gitAnnexExportDir,
@@ -416,6 +418,13 @@ gitAnnexMigrateLog r = gitAnnexDir r P.</> "migrate.log"
gitAnnexMigrateLock :: Git.Repo -> RawFilePath
gitAnnexMigrateLock r = gitAnnexDir r P.</> "migrate.lck"
+{- .git/annex/migrations.log is used to log committed migrations. -}
+gitAnnexMigrationsLog :: Git.Repo -> RawFilePath
+gitAnnexMigrationsLog r = gitAnnexDir r P.</> "migrations.log"
+
+gitAnnexMigrationsLock :: Git.Repo -> RawFilePath
+gitAnnexMigrationsLock r = gitAnnexDir r P.</> "migrations.lck"
+
{- .git/annex/move.log is used to log moves that are in progress,
- to better support resuming an interrupted move. -}
gitAnnexMoveLog :: Git.Repo -> RawFilePath
diff --git a/CHANGELOG b/CHANGELOG
index ab8e557dca..5a2281bf81 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,5 +1,7 @@
git-annex (10.20231130) UNRELEASED; urgency=medium
+ * migrate: Support distributed migrations by recording each migration,
+ and adding a --update option that updates the local repository.
* Make git-annex get/copy/move --from foo override configuration of
remote.foo.annex-ignore, as documented.
* Support git-annex copy/move --from-anywhere --to remote.
diff --git a/Command/Log.hs b/Command/Log.hs
index e7f4de1bcf..54f56e1002 100644
--- a/Command/Log.hs
+++ b/Command/Log.hs
@@ -290,8 +290,9 @@ getKeyLog key os = do
getGitLogAnnex :: [FilePath] -> [CommandParam] -> Annex ([RefChange Key], IO Bool)
getGitLogAnnex fs os = do
config <- Annex.getGitConfig
- let fileselector = locationLogFileKey config . toRawFilePath
- inRepo $ getGitLog Annex.Branch.fullname fs os fileselector
+ let fileselector = \_sha f ->
+ locationLogFileKey config (toRawFilePath f)
+ inRepo $ getGitLog Annex.Branch.fullname Nothing fs os fileselector
showTimeStamp :: TimeZone -> String -> POSIXTime -> String
showTimeStamp zone format = formatTime defaultTimeLocale format
@@ -321,13 +322,13 @@ sizeHistoryInfo mu o = do
-- and to the trust log.
getlog = do
config <- Annex.getGitConfig
- let fileselector = \f -> let f' = toRawFilePath f in
+ let fileselector = \_sha f -> let f' = toRawFilePath f in
case locationLogFileKey config f' of
Just k -> Just (Right k)
Nothing
| f' == trustLog -> Just (Left ())
| otherwise -> Nothing
- inRepo $ getGitLog Annex.Branch.fullname []
+ inRepo $ getGitLog Annex.Branch.fullname Nothing []
[ Param "--date-order"
, Param "--reverse"
]
diff --git a/Command/Migrate.hs b/Command/Migrate.hs
index 6755680d9d..614eab858a 100644
--- a/Command/Migrate.hs
+++ b/Command/Migrate.hs
@@ -28,6 +28,7 @@ cmd = withAnnexOptions [backendOption, annexedMatchingOptions, jsonOptions] $
data MigrateOptions = MigrateOptions
{ migrateThese :: CmdParams
+ , updateOption :: Bool
, removeSize :: Bool
}
@@ -35,14 +36,23 @@ optParser :: CmdParamsDesc -> Parser MigrateOptions
optParser desc = MigrateOptions
<$> cmdParams desc
<*> switch
+ ( long "update"
+ <> help "update for migrations performed elsewhere"
+ )
+ <*> switch
( long "remove-size"
<> help "remove size field from keys"
)
seek :: MigrateOptions -> CommandSeek
-seek o = do
- withFilesInGitAnnex ww seeker =<< workTreeItems ww (migrateThese o)
- commitMigration
+seek o
+ | updateOption o = do
+ unless (null (migrateThese o)) $
+ error "Cannot combine --update with files to migrate."
+ commandAction update
+ | otherwise = do
+ withFilesInGitAnnex ww seeker =<< workTreeItems ww (migrateThese o)
+ commitMigration
where
ww = WarnUnmatchLsFiles "migrate"
seeker = AnnexedFileSeeker
@@ -131,3 +141,10 @@ perform onlyremovesize o file oldkey oldkeyrec oldbackend newbackend = go =<< ge
| removeSize o = alterKey k $ \kd -> kd { keySize = Nothing }
| otherwise = k
afile = AssociatedFile (Just file)
+
+update :: CommandStart
+update = starting "migrate" (ActionItemOther Nothing) (SeekInput []) $ do
+ streamNewDistributedMigrations $ \oldkey newkey -> do
+ liftIO $ print ("migrate", oldkey, newkey)
+ next $ return True
+
diff --git a/Git/Log.hs b/Git/Log.hs
index 83bd4eee35..1b3c03b812 100644
--- a/Git/Log.hs
+++ b/Git/Log.hs
@@ -10,6 +10,7 @@ module Git.Log where
import Common
import Git
import Git.Command
+import Git.Sha
import Data.Time
import Data.Time.Clock.POSIX
@@ -28,14 +29,15 @@ data RefChange t = RefChange
-- run after processing the returned list.
getGitLog
:: Ref
+ -> Maybe Ref
-> [FilePath]
-> [CommandParam]
- -> (FilePath -> Maybe t)
+ -> (Sha -> FilePath -> Maybe t)
-> Repo
-> IO ([RefChange t], IO Bool)
-getGitLog ref fs os fileselector repo = do
+getGitLog ref stopref fs os selector repo = do
(ls, cleanup) <- pipeNullSplit ps repo
- return (parseGitRawLog fileselector (map decodeBL ls), cleanup)
+ return (parseGitRawLog selector (map decodeBL ls), cleanup)
where
ps =
[ Param "log"
@@ -45,14 +47,16 @@ getGitLog ref fs os fileselector repo = do
, Param "--no-abbrev"
, Param "--no-renames"
] ++ os ++
- [ Param (fromRef ref)
+ [ case stopref of
+ Just stopref' -> Param $
+ fromRef stopref' <> ".." <> fromRef ref
+ Nothing -> Param (fromRef ref)
, Param "--"
] ++ map Param fs
--- The commitinfo is the timestamp of the commit, followed by
--- the commit hash and then the commit's parents, separated by spaces.
+-- The commitinfo is the commit hash followed by its timestamp.
commitinfoFormat :: String
-commitinfoFormat = "%ct"
+commitinfoFormat = "%H %ct"
-- Parses chunked git log --raw output generated by getGitLog,
-- which looks something like:
@@ -69,21 +73,23 @@ commitinfoFormat = "%ct"
--
-- The commitinfo is not included before all changelines, so
-- keep track of the most recently seen commitinfo.
-parseGitRawLog :: (FilePath -> Maybe t) -> [String] -> [RefChange t]
-parseGitRawLog fileselector = parse epoch
+parseGitRawLog :: (Ref -> FilePath -> Maybe t) -> [String] -> [RefChange t]
+parseGitRawLog selector = parse (deleteSha, epoch)
where
epoch = toEnum 0 :: POSIXTime
- parse oldts ([]:rest) = parse oldts rest
- parse oldts (c1:c2:rest) = case mrc of
- Just rc -> rc : parse ts rest
- Nothing -> parse ts (c2:rest)
+ parse old ([]:rest) = parse old rest
+ parse (oldcommitsha, oldts) (c1:c2:rest) = case mrc of
+ Just rc -> rc : parse (commitsha, ts) rest
+ Nothing -> parse (commitsha, ts) (c2:rest)
where
- (ts, cl) = case separate (== '\n') c1 of
- (cl', []) -> (oldts, cl')
- (tss, cl') -> (parseTimeStamp tss, cl')
+ (commitsha, ts, cl) = case separate (== '\n') c1 of
+ (cl', []) -> (oldcommitsha, oldts, cl')
+ (ci, cl') -> case words ci of
+ (css:tss:[]) -> (Ref (encodeBS css), parseTimeStamp tss, cl')
+ _ -> (oldcommitsha, oldts, cl')
mrc = do
(old, new) <- parseRawChangeLine cl
- v <- fileselector c2
+ v <- selector commitsha c2
return $ RefChange
{ changetime = ts
, changed = v
diff --git a/Logs/Export.hs b/Logs/Export.hs
index a405534623..36d6156c75 100644
--- a/Logs/Export.hs
+++ b/Logs/Export.hs
@@ -78,7 +78,7 @@ recordExportBeginning remoteuuid newtree = do
-- repository, the tree has to be kept available, even if it
-- doesn't end up being merged into the master branch.
recordExportTreeish :: Git.Ref -> Annex ()
-recordExportTreeish t =
+recordExportTreeish t = void $
Annex.Branch.rememberTreeish t (asTopFilePath exportTreeGraftPoint)
-- | Record that an export to a special remote is under way.
diff --git a/Logs/Migrate.hs b/Logs/Migrate.hs
index a527cc64ab..f6444f1ee7 100644
--- a/Logs/Migrate.hs
+++ b/Logs/Migrate.hs
@@ -16,7 +16,19 @@
- The filenames used in the trees are not the original filenames, to avoid
- running migrate in a throwaway branch unexpectedly recording that
- branch's contents.
--
+ -
+ - There are two local log files:
+ - * migrate.log contains pairs of old and new keys, and is used while
+ - performing a new migration, to build up a migration to commit.
+ - This allows an interrupted migration to be resumed later.
+ - * migrations.log has as its first line a commit to the git-annex branch
+ - up to which all migrations have been performed locally (including any
+ - migrations in parent commits). Or the first line may be a null sha when
+ - this has not been done yet. The rest of the lines in the file
+ - are commits that have been made for locally performed migrations,
+ - but whose parent commits have not necessarily been checked for
+ - migrations yet.
+ -
- Copyright 2023 Joey Hess <id@joeyh.name>
-
- Licensed under the GNU AGPL version 3 or higher.
@@ -28,6 +40,7 @@ module Logs.Migrate (
MigrationRecord(..),
logMigration,
commitMigration,
+ streamNewDistributedMigrations,
) where
import Annex.Common
@@ -39,6 +52,9 @@ import Git.Tree
import Git.FilePath
import Logs.File
import Logs
+import Git.Log
+import Git.Sha
+import Annex.CatFile
import qualified Data.ByteString.Lazy as L
import Control.Concurrent.STM
@@ -48,9 +64,6 @@ import Control.Concurrent.STM
newtype MigrationRecord = MigrationRecord { fromMigrationRecord :: Git.Sha }
-- | Logs a migration from an old to a new key.
---
--- This writes to a log file, which can later be committed. That allows an
--- interrupted migration to be resumed later.
logMigration :: MigrationRecord -> MigrationRecord -> Annex ()
logMigration old new = do
logf <- fromRepo gitAnnexMigrateLog
@@ -97,5 +110,77 @@ commitMigration = do
[ RecordedSubTree (asTopFilePath "old") oldt []
, RecordedSubTree (asTopFilePath "new") newt []
]
- Annex.Branch.rememberTreeish treesha
+ commitsha <- Annex.Branch.rememberTreeish treesha
(asTopFilePath migrationTreeGraftPoint)
+ committedMigration commitsha
+
+-- Streams distributed migrations from the git-annex branch that have not
+-- been performed here, and runs the provided action on each old and new
+-- key pair.
+streamNewDistributedMigrations :: (Key -> Key -> Annex ()) -> Annex ()
+streamNewDistributedMigrations a = do
+ void Annex.Branch.update
+ branchsha <- Annex.Branch.getBranch
+ (stoppoint, toskip) <- getPerformedMigrations
+ (l, cleanup) <- inRepo $ getGitLog branchsha stoppoint
+ [fromRawFilePath migrationTreeGraftPoint]
+ -- Need to follow because migrate.tree is grafted in
+ -- and then deleted, and normally git log stops when a file
+ -- gets deleted.
+ ([Param "--reverse", Param "--follow"])
+ (\sha _file -> Just sha)
+ forM_ l $ \c ->
+ unless (changed c `elem` toskip) $ do
+ moldkey <- catKey XXX
+ mnewkey <- catKey YYY
+ case (moldkey, mnewkey) of
+ (Just oldkey, Just newkey) -> a oldkey newkey
+ _ -> return ()
+ liftIO $ void cleanup
+ recordPerformedMigrations branchsha toskip
+
+getPerformedMigrations :: Annex (Maybe Sha, [Sha])
+getPerformedMigrations = do
+ logf <- fromRepo gitAnnexMigrationsLog
+ lckf <- fromRepo gitAnnexMigrationsLock
+ ls <- calcLogFile logf lckf [] (:)
+ return $ case reverse ls of
+ [] -> (Nothing, [])
+ (stoppoint:toskip) ->
+ let stoppoint' = conv stoppoint
+ in
+ ( if stoppoint' `elem` nullShas
+ then Nothing
+ else Just stoppoint'
+ , map conv toskip
+ )
+ where
+ conv = Git.Ref . L.toStrict
+
+-- Record locally that migrations have been performed up to the given
+-- commit. The list is additional commits that can be removed from the
+-- log file if present.
+recordPerformedMigrations :: Sha -> [Sha] -> Annex ()
+recordPerformedMigrations sha toremove = do
+ logf <- fromRepo gitAnnexMigrationsLog
+ lckf <- fromRepo gitAnnexMigrationsLock
+ modifyLogFile logf lckf (update . drop 1)
+ where
+ update l = L.fromStrict (fromRef' sha) : filter (`notElem` toremove') l
+ toremove' = map (L.fromStrict . fromRef') toremove
+
+-- Record that a migration was performed locally and committed.
+-- Since committing a migration may result in parent migrations that have
+-- not yet been processed locally, that commit cannot be the first line of
+-- the log file, which is reserved for commits whose parents have also had
+-- their migrations handled. So if the log file does not exist or is empty,
+-- make the first line a null sha.
+committedMigration :: Sha -> Annex ()
+committedMigration commitsha = do
+ logf <- fromRepo gitAnnexMigrationsLog
+ lckf <- fromRepo gitAnnexMigrationsLock
+ modifyLogFile logf lckf update
+ where
+ update [] = [conv deleteSha, conv commitsha]
+ update logged = logged ++ [conv commitsha]
+ conv = L.fromStrict . fromRef'
diff --git a/doc/git-annex-migrate.mdwn b/doc/git-annex-migrate.mdwn
index 30d24f943f..7361ca03f9 100644
--- a/doc/git-annex-migrate.mdwn
+++ b/doc/git-annex-migrate.mdwn
@@ -6,6 +6,8 @@ git-annex migrate - switch data to different backend
git annex migrate `[path ...]`
+git annex migrate --update
+
# DESCRIPTION
Changes the specified annexed files to use the default key-value backend
@@ -19,11 +21,17 @@ Normally, nothing will be done to files already using the new backend.
However, if a backend changes the information it uses to construct a key,
this can also be used to migrate files to use the new key format.
-When you have multiple repositories that each contain a copy of a file,
-it's best to run migrate in all of them.
-
# OPTIONS
+* `--update`
+
+ This updates the local repository for migrations that were performed
+ elsewhere.
+
+ Note that older versions of git-annex did not record migrations in a
+ way that this can use. Migrations performed with those older versions
+ had to be manually run in each clone of the repository.
+
* `--backend`
Specify the new key-value backend to use for migrated data.