convert key2file and file2key to use builder and attoparsec

The new parser is significantly stricter than the old one: The old file2key allowed the fields to come in any order, but the new one requires the fixed order that git-annex has always used. Hopefully this will not cause any breakage. And the old file2key allowed eg SHA1-m1-m2-m3-m4-m5-m6--xxxx while the new does not allow duplication of fields. This could potentially improve security, because allowing lots of extra junk like that in a key could potentially be used in a SHA1 collision attack, although the current attacks need binary data and not this kind of structured numeric data. Speed improved of course, and fairly substantially, in microbenchmarks: benchmarking old/key2file time 2.264 μs (2.257 μs .. 2.273 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 2.265 μs (2.260 μs .. 2.275 μs) std dev 21.17 ns (13.06 ns .. 39.26 ns) benchmarking new/key2file' time 1.744 μs (1.741 μs .. 1.747 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 1.745 μs (1.742 μs .. 1.751 μs) std dev 13.55 ns (9.099 ns .. 21.89 ns) benchmarking old/file2key time 6.114 μs (6.102 μs .. 6.129 μs) 1.000 R² (1.000 R² .. 1.000 R²) mean 6.118 μs (6.106 μs .. 6.143 μs) std dev 55.00 ns (30.08 ns .. 100.2 ns) benchmarking new/file2key' time 1.791 μs (1.782 μs .. 1.801 μs) 1.000 R² (0.999 R² .. 1.000 R²) mean 1.792 μs (1.785 μs .. 1.804 μs) std dev 32.46 ns (20.59 ns .. 50.82 ns) variance introduced by outliers: 19% (moderately inflated)
author: Joey Hess <joeyh@joeyh.name> 2019-01-11 16:33:42 -0400
committer: Joey Hess <joeyh@joeyh.name> 2019-01-11 16:33:42 -0400
commit: 151562b537b1d0d6b09866e876cd05d538fbc53a (patch)
tree: 019527eae41209e80ffc2c0f700572c5c2e1ad8a /Key.hs
parent: b552551b332b540f4afac7d744cb8a1ddd790e35 (diff)
1 files changed, 96 insertions, 70 deletions
diff --git a/Key.hs b/Key.hs
index ade012a4ba..7fa4176eef 100644
--- a/Key.hs
+++ b/Key.hs
@@ -1,6 +1,6 @@
 {- git-annex Keys
  -
- - Copyright 2011-2017 Joey Hess <id@joeyh.name>
+ - Copyright 2011-2019 Joey Hess <id@joeyh.name>
  -
  - Licensed under the GNU GPL version 3 or higher.
  -}
@@ -11,19 +11,31 @@ module Key (
 	Key(..),
 	AssociatedFile(..),
 	stubKey,
-	key2file,
+	buildKeyFile,
+	keyFileParser,
 	file2key,
+	key2file,
+	file2key',
+	key2file',
 	nonChunkKey,
 	chunkKeyOffset,
 	isChunkKey,
 	isKeyPrefix,
+	splitKeyNameExtension,
 
 	prop_isomorphic_key_encode,
 	prop_isomorphic_key_decode
 ) where
 
-import Data.Char
 import qualified Data.Text as T
+import qualified Data.ByteString as S
+import qualified Data.ByteString.Char8 as S8
+import qualified Data.ByteString.Lazy as L
+import Data.ByteString.Builder
+import Data.ByteString.Builder.Extra
+import qualified Data.Attoparsec.ByteString as A
+import qualified Data.Attoparsec.ByteString.Char8 as A8
+import Foreign.C.Types
 
 import Common
 import Types.Key
@@ -34,8 +46,8 @@ import qualified Utility.SimpleProtocol as Proto
 
 stubKey :: Key
 stubKey = Key
-	{ keyName = ""
-	, keyVariety = OtherKey ""
+	{ keyName = mempty
+	, keyVariety = OtherKey mempty
 	, keySize = Nothing
 	, keyMtime = Nothing
 	, keyChunkSize = Nothing
@@ -65,69 +77,81 @@ isKeyPrefix s = [fieldSep, fieldSep] `isInfixOf` s
 fieldSep :: Char
 fieldSep = '-'
 
-{- Converts a key to a string that is suitable for use as a filename.
- - The name field is always shown last, separated by doubled fieldSeps,
+{- Builds a ByteString that is suitable for use as a filename representing
+ - a Key. The name field is always shown last, separated by doubled fieldSeps,
  - and is the only field allowed to contain the fieldSep. -}
+buildKeyFile :: Key -> Builder
+buildKeyFile k = byteString (formatKeyVariety (keyVariety k))
+	<> 's' ?: (integerDec <$> keySize k)
+	<> 'm' ?: (integerDec . (\(CTime t) -> fromIntegral t) <$> keyMtime k)
+	<> 'S' ?: (integerDec <$> keyChunkSize k)
+	<> 'C' ?: (integerDec <$> keyChunkNum k)
+	<> sepbefore (sepbefore (byteString (keyName k)))
+  where
+	sepbefore s = char7 fieldSep <> s
+	c ?: (Just b) = sepbefore (char7 c <> b)
+	_ ?: Nothing = mempty
+
 key2file :: Key -> FilePath
-key2file Key { keyVariety = kv, keySize = s, keyMtime = m, keyChunkSize = cs, keyChunkNum = cn, keyName = n } =
-	formatKeyVariety kv +++ ('s' ?: s) +++ ('m' ?: m) +++ ('S' ?: cs) +++ ('C' ?: cn) +++ (fieldSep : n)
+key2file = decodeBL . key2file'
+
+key2file' :: Key -> L.ByteString
+key2file' = toLazyByteStringWith (safeStrategy 128 smallChunkSize) L.empty . buildKeyFile
+
+{- This is a strict parser for security reasons; a key
+ - can contain only 4 fields, which all consist only of numbers.
+ - Any key containing other fields, or non-numeric data will fail
+ - to parse.
+ -
+ - If a key contained non-numeric fields, they could be used to
+ - embed data used in a SHA1 collision attack, which would be a
+ - problem since the keys are committed to git.
+ -}
+keyFileParser :: A.Parser Key
+keyFileParser = do
+	-- key variety cannot be empty
+	v <- (parseKeyVariety <$> A8.takeWhile1 (/= fieldSep))
+	s <- parsesize
+	m <- parsemtime
+	cs <- parsechunksize
+	cn <- parsechunknum
+	_ <- A8.char fieldSep
+	_ <- A8.char fieldSep
+	n <- A.takeByteString
+	if validKeyName v n
+		then return $ Key
+			{ keyName = n
+			, keyVariety = v
+			, keySize = s
+			, keyMtime = m
+			, keyChunkSize = cs
+			, keyChunkNum = cn
+			}
+		else fail "invalid keyName"
   where
-	"" +++ y = y
-	x +++ "" = x
-	x +++ y = x ++ fieldSep:y
-	f ?: (Just v) = f : show v
-	_ ?: _ = ""
+	parseopt p = (Just <$> (A8.char fieldSep *> p)) <|> pure Nothing
+	parsesize = parseopt $ A8.char 's' *> A8.decimal
+	parsemtime = parseopt $ CTime <$> (A8.char 'm' *> A8.decimal)
+	parsechunksize = parseopt $ A8.char 'S' *> A8.decimal
+	parsechunknum = parseopt $ A8.char 'C' *> A8.decimal
 
 file2key :: FilePath -> Maybe Key
-file2key s
-	| key == Just stubKey || (keyName <$> key) == Just "" || (keyVariety <$> key) == Just (OtherKey "") = Nothing
-	| otherwise = key
-  where
-	key = startbackend stubKey s
-
-	startbackend k v = sepfield k v addvariety
-		
-	sepfield k v a = case span (/= fieldSep) v of
-		(v', _:r) -> findfields r $ a k v'
-		_ -> Nothing
-
-	findfields (c:v) (Just k)
-		| c == fieldSep = addkeyname k v
-		| otherwise = sepfield k v $ addfield c
-	findfields _ v = v
-
-	addvariety k v = Just k { keyVariety = parseKeyVariety v }
-
-	-- This is a strict parser for security reasons; a key
-	-- can contain only 4 fields, which all consist only of numbers.
-	-- Any key containing other fields, or non-numeric data is
-	-- rejected with Nothing.
-	--
-	-- If a key contained non-numeric fields, they could be used to
-	-- embed data used in a SHA1 collision attack, which would be a
-	-- problem since the keys are committed to git.
-	addfield _ _ v | not (all isDigit v) = Nothing
-	addfield 's' k v = do
-		sz <- readish v
-		return $ k { keySize = Just sz }
-	addfield 'm' k v = do
-		mtime <- readish v
-		return $ k { keyMtime = Just mtime }
-	addfield 'S' k v = do
-		chunksize <- readish v
-		return $ k { keyChunkSize = Just chunksize }
-	addfield 'C' k v = case readish v of
-		Just chunknum | chunknum > 0 ->
-			return $ k { keyChunkNum = Just chunknum }
-		_ -> Nothing
-	addfield _ _ _ = Nothing
-
-	addkeyname k v
-		| validKeyName k v = Just $ k { keyName = v }
-		| otherwise = Nothing
-
-{- When a key HasExt, the length of the extension is limited in order to
- - mitigate against SHA1 collision attacks.
+file2key = file2key' . encodeBS
+
+file2key' :: S.ByteString -> Maybe Key
+file2key' b = eitherToMaybe $ A.parseOnly keyFileParser b
+
+{- This splits any extension out of the keyName, returning the 
+ - keyName minus extension, and the extension (including leading dot).
+ -}
+splitKeyNameExtension :: Key -> (S.ByteString, S.ByteString)
+splitKeyNameExtension = splitKeyNameExtension' . keyName
+
+splitKeyNameExtension' :: S.ByteString -> (S.ByteString, S.ByteString)
+splitKeyNameExtension' keyname = S8.span (/= '.') keyname
+
+{- Limits the length of the extension in the keyName to mitigate against
+ - SHA1 collision attacks.
  -
  - In such an attack, the extension of the key could be made to contain
  - the collision generation data, with the result that a signed git commit
@@ -137,23 +161,25 @@ file2key s
  - characters; 20 is used here to give a little future wiggle-room. 
  - The SHA1 common-prefix attack needs 128 bytes of data.
  -}
-validKeyName :: Key -> String -> Bool
-validKeyName k name
-	| hasExt (keyVariety k) = length (takeExtensions name) <= 20
+validKeyName :: KeyVariety -> S.ByteString -> Bool
+validKeyName kv name
+	| hasExt kv = 
+		let ext = snd $ splitKeyNameExtension' name
+		in S.length ext <= 20
 	| otherwise = True
 
 instance Arbitrary Key where
 	arbitrary = Key
-		<$> (listOf1 $ elements $ ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t")
-		<*> (parseKeyVariety <$> (listOf1 $ elements ['A'..'Z'])) -- BACKEND
+		<$> (encodeBS <$> (listOf1 $ elements $ ['A'..'Z'] ++ ['a'..'z'] ++ ['0'..'9'] ++ "-_\r\n \t"))
+		<*> (parseKeyVariety . encodeBS <$> (listOf1 $ elements ['A'..'Z'])) -- BACKEND
 		<*> ((abs <$>) <$> arbitrary) -- size cannot be negative
 		<*> ((abs . fromInteger <$>) <$> arbitrary) -- mtime cannot be negative
 		<*> ((abs <$>) <$> arbitrary) -- chunksize cannot be negative
 		<*> ((succ . abs <$>) <$> arbitrary) -- chunknum cannot be 0 or negative
 
 instance Hashable Key where
-	hashIO32 = hashIO32 . key2file
-	hashIO64 = hashIO64 . key2file
+	hashIO32 = hashIO32 . key2file'
+	hashIO64 = hashIO64 . key2file'
 
 instance ToJSON' Key where
 	toJSON' = toJSON' . key2file
author	Joey Hess <joeyh@joeyh.name>	2019-01-11 16:33:42 -0400
committer	Joey Hess <joeyh@joeyh.name>	2019-01-11 16:33:42 -0400
commit	151562b537b1d0d6b09866e876cd05d538fbc53a (patch)
tree	019527eae41209e80ffc2c0f700572c5c2e1ad8a /Key.hs
parent	b552551b332b540f4afac7d744cb8a1ddd790e35 (diff)