summary | refs | log | tree | commit | diff
path: root/git_crecord
diff options
context:
space:
mode:
author Andrej Shadura <andrew@shadura.me> 2022-03-06 18:39:03 +0100
committer Andrej Shadura <andrew@shadura.me> 2022-03-06 21:37:38 +0100
commit 2f6897810b631838f83ca87b0ff157f9b8a2f21e (patch)
tree b06d94b711a77cd39880658ddd5b5040f247edbd /git_crecord
parent ad0254662b5c1912c08cc561c1fc080a657ab9da (diff)
Implement Git path unquoting
While disabling core.quotePath helps with most non-ASCII filenames, there are a few still affected by the quoting:

> Double-quotes, backslash and control characters are always escaped
> regardless of the setting of this variable.

To cope with filenames containing those, parse quoted paths properly. For more details, see: https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
Diffstat (limited to 'git_crecord')
-rw-r--r-- git_crecord/crecord_core.py 17
-rw-r--r-- git_crecord/crpatch.py 26
-rw-r--r-- git_crecord/util.py 37
3 files changed, 70 insertions, 10 deletions
diff --git a/git_crecord/crecord_core.py b/git_crecord/crecord_core.py
index 38a7191..f8e01d4 100644
--- a/git_crecord/crecord_core.py
+++ b/git_crecord/crecord_core.py
@@ -37,15 +37,18 @@ def dorecord(ui, repo, commitfunc, *pats, **opts):
In the end we'll record interesting changes, and everything else will be
left in place, so the user can continue his work.
- Disable `core.quotePath` to support non-ASCII filenames.
- By default (with `core.quotePath=true`) `git diff` only shows filename characters printable in ASCII,
- and the presence of any character higher than U+007F will cause `git diff`'s output to double-quote
- the filename and replace the non-ASCII characters in that filename with their octal representations.
- The double-quoting (i.e. `diff --git "a/` instead of `diff --git a/`) breaks `crecord`'s stdout parsing.
- https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
+ We pass additional configuration options to Git to make the diff output
+ consistent:
+ - core.quotePath:
+ Limit symbols requiring escaping to double-quotes, backslash and
+ control characters.
+ - diff.mnemonicPrefix:
+ If set, git diff uses a prefix pair that is different from the standard
+ "a/" and "b/" depending on what is being compared. Our parser only
+ supports "a/" and "b/".
"""
- git_args = ["git", "-c", "core.quotepath=false", "-c", "diff.mnemonicprefix=false", "diff", "--binary"]
+ git_args = ["git", "-c", "core.quotePath=false", "-c", "diff.mnemonicPrefix=false", "diff", "--binary"]
git_base = []
if opts['cached']:
diff --git a/git_crecord/crpatch.py b/git_crecord/crpatch.py
index f9a91a2..01549b2 100644
--- a/git_crecord/crpatch.py
+++ b/git_crecord/crpatch.py
@@ -9,6 +9,8 @@ from codecs import register_error
from typing import IO, Iterator, Optional, Sequence, Union
+from .util import unwrap_filename
+
lines_re = re.compile(b'@@ -(\\d+)(?:,(\\d+))? \\+(\\d+)(?:,(\\d+))? @@\\s*(.*)')
@@ -106,7 +108,7 @@ def scanpatch(fp: IO[bytes]):
return lines
for line in iter(lr.readline, b''):
- if line.startswith(b'diff --git a/'):
+ if line.startswith(b'diff --git a/') or line.startswith(b'diff --git "a/'):
def notheader(line: bytes) -> bool:
s = line.split(None, 1)
return not s or s[0] not in (b'---', b'diff')
@@ -263,7 +265,7 @@ class PatchNode:
class Header(PatchNode):
"""Patch header"""
- diff_re = re.compile(b'diff --git a/(.*) b/(.*)$')
+ diff_re = re.compile(b'diff --git (?P<fromfile>(?P<aq>")?a/.*(?(aq)"|)) (?P<tofile>(?P<bq>")?b/.*(?(bq)"|))$')
allhunks_re = re.compile(b'(?:GIT binary patch|new file|deleted file) ')
pretty_re = re.compile(b'(?:new file|deleted file) ')
special_re = re.compile(b'(?:GIT binary patch|new|deleted|copy|rename) ')
@@ -331,7 +333,9 @@ class Header(PatchNode):
return any(self.allhunks_re.match(h) for h in self.header)
def files(self):
- fromfile, tofile = self.diff_re.match(self.header[0]).groups()
+ fromfile, tofile = self.diff_re.match(self.header[0]).group('fromfile', 'tofile')
+ fromfile = unwrap_filename(fromfile).removeprefix(b'a/')
+ tofile = unwrap_filename(tofile).removeprefix(b'b/')
if self.changetype == 'D':
tofile = None
elif self.changetype == 'A':
@@ -757,6 +761,22 @@ def parsepatch(fp: IO[bytes]) -> PatchRoot:
@@ -0,0 +1,2 @@
+<CD><CE><CD>-<D3><D2><D4>-8 <F2><E5><F1><F2>
+test
+
+ Quoted filenames in the diff headers are supported too:
+ >>> rawpatch = b'''diff --git "a/test- \\\\\\321\\217\\321\\217" "b/test- \\\\\\321\\217\\321\\217"
+ ... new file mode 100644
+ ... index 000000000000..7f53c853ca78
+ ... --- /dev/null
+ ... +++ "b/test- \\\\\\321\\217\\321\\217"
+ ... @@ -0,0 +1,2 @@
+ ... +\xCD\xCE\xCD-\xD3\xD2\xD4-8 \xF2\xE5\xF1\xF2
+ ... +test'''
+ >>> fp = io.BytesIO(rawpatch)
+ >>> patch = parsepatch(fp)
+ >>> files = patch.headers[0].files()
+ >>> files[0]
+ >>> files[1].decode('UTF-8')
+ 'test- \\яя'
"""
class Parser:
diff --git a/git_crecord/util.py b/git_crecord/util.py
index eb84e9a..6c1065f 100644
--- a/git_crecord/util.py
+++ b/git_crecord/util.py
@@ -201,3 +201,40 @@ _notset = object()
def safehasattr(thing, attr):
return getattr(thing, attr, _notset) is not _notset
+
+
+def unescape_filename(filename: bytes) -> bytes:
+ r"""Unescape a filename after Git mangled it for "git diff --git" line.
+
+ >>> unescape_filename(b'a/\\321\\216\\321\\217')
+ b'a/\xd1\x8e\xd1\x8f'
+ >>> unescape_filename(b'a/\\\\')
+ b'a/\\'
+ >>> unescape_filename(b'a/file\\55name')
+ b'a/file-name'
+ """
+ unescaped_unicode = filename.decode('unicode_escape')
+ return bytes(ord(x) for x in unescaped_unicode)
+
+
+def unwrap_filename(filename: bytes) -> bytes:
+ r"""Unwrap a filename mangled by Git
+
+ If the filename is in double quotes, remove them and unescape enclosed characters.
+ Otherwise, return the input as is.
+
+ >>> def apply(f, s: str) -> str:
+ ... return f(s.encode("UTF-8")).decode("UTF-8")
+ >>> apply(unwrap_filename, 'a/filename')
+ 'a/filename'
+ >>> apply(unwrap_filename, 'a/имя-файла')
+ 'a/имя-файла'
+ >>> apply(unwrap_filename, '"a/file\\55name"')
+ 'a/file-name'
+ >>> apply(unwrap_filename, '"a/им\\321\\217\55\\\\name"')
+ 'a/имя-\\name'
+ """
+ if filename.startswith(b'"') and filename.endswith(b'"'):
+ return unescape_filename(filename[1:-1])
+ else:
+ return filename