diff options
author | Andrej Shadura <andrew@shadura.me> | 2022-03-06 18:39:03 +0100 |
---|---|---|
committer | Andrej Shadura <andrew@shadura.me> | 2022-03-06 21:37:38 +0100 |
commit | 2f6897810b631838f83ca87b0ff157f9b8a2f21e (patch) | |
tree | b06d94b711a77cd39880658ddd5b5040f247edbd /git_crecord | |
parent | ad0254662b5c1912c08cc561c1fc080a657ab9da (diff) |
Implement Git path unquoting
While disabling core.quotePath helps with most non-ASCII filenames,
there are a few still affected by the quoting:
> Double-quotes, backslash and control characters are always escaped
> regardless of the setting of this variable.
To cope with filenames containing those, parse quoted paths properly.
For more details, see: https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
Diffstat (limited to 'git_crecord')
-rw-r--r-- | git_crecord/crecord_core.py | 17 | ||||
-rw-r--r-- | git_crecord/crpatch.py | 26 | ||||
-rw-r--r-- | git_crecord/util.py | 37 |
3 files changed, 70 insertions, 10 deletions
diff --git a/git_crecord/crecord_core.py b/git_crecord/crecord_core.py index 38a7191..f8e01d4 100644 --- a/git_crecord/crecord_core.py +++ b/git_crecord/crecord_core.py @@ -37,15 +37,18 @@ def dorecord(ui, repo, commitfunc, *pats, **opts): In the end we'll record interesting changes, and everything else will be left in place, so the user can continue his work. - Disable `core.quotePath` to support non-ASCII filenames. - By default (with `core.quotePath=true`) `git diff` only shows filename characters printable in ASCII, - and the presence of any character higher than U+007F will cause `git diff`'s output to double-quote - the filename and replace the non-ASCII characters in that filename with their octal representations. - The double-quoting (i.e. `diff --git "a/` instead of `diff --git a/`) breaks `crecord`'s stdout parsing. - https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath + We pass additional configuration options to Git to make the diff output + consistent: + - core.quotePath: + Limit symbols requiring escaping to double-quotes, backslash and + control characters. + - diff.mnemonicPrefix: + If set, git diff uses a prefix pair that is different from the standard + "a/" and "b/" depending on what is being compared. Our parser only + supports "a/" and "b/". """ - git_args = ["git", "-c", "core.quotepath=false", "-c", "diff.mnemonicprefix=false", "diff", "--binary"] + git_args = ["git", "-c", "core.quotePath=false", "-c", "diff.mnemonicPrefix=false", "diff", "--binary"] git_base = [] if opts['cached']: diff --git a/git_crecord/crpatch.py b/git_crecord/crpatch.py index f9a91a2..01549b2 100644 --- a/git_crecord/crpatch.py +++ b/git_crecord/crpatch.py @@ -9,6 +9,8 @@ from codecs import register_error from typing import IO, Iterator, Optional, Sequence, Union +from .util import unwrap_filename + lines_re = re.compile(b'@@ -(\\d+)(?:,(\\d+))? \\+(\\d+)(?:,(\\d+))? 
@@\\s*(.*)') @@ -106,7 +108,7 @@ def scanpatch(fp: IO[bytes]): return lines for line in iter(lr.readline, b''): - if line.startswith(b'diff --git a/'): + if line.startswith(b'diff --git a/') or line.startswith(b'diff --git "a/'): def notheader(line: bytes) -> bool: s = line.split(None, 1) return not s or s[0] not in (b'---', b'diff') @@ -263,7 +265,7 @@ class PatchNode: class Header(PatchNode): """Patch header""" - diff_re = re.compile(b'diff --git a/(.*) b/(.*)$') + diff_re = re.compile(b'diff --git (?P<fromfile>(?P<aq>")?a/.*(?(aq)"|)) (?P<tofile>(?P<bq>")?b/.*(?(bq)"|))$') allhunks_re = re.compile(b'(?:GIT binary patch|new file|deleted file) ') pretty_re = re.compile(b'(?:new file|deleted file) ') special_re = re.compile(b'(?:GIT binary patch|new|deleted|copy|rename) ') @@ -331,7 +333,9 @@ class Header(PatchNode): return any(self.allhunks_re.match(h) for h in self.header) def files(self): - fromfile, tofile = self.diff_re.match(self.header[0]).groups() + fromfile, tofile = self.diff_re.match(self.header[0]).group('fromfile', 'tofile') + fromfile = unwrap_filename(fromfile).removeprefix(b'a/') + tofile = unwrap_filename(tofile).removeprefix(b'b/') if self.changetype == 'D': tofile = None elif self.changetype == 'A': @@ -757,6 +761,22 @@ def parsepatch(fp: IO[bytes]) -> PatchRoot: @@ -0,0 +1,2 @@ +<CD><CE><CD>-<D3><D2><D4>-8 <F2><E5><F1><F2> +test + + Quoted filenames in the diff headers are supported too: + >>> rawpatch = b'''diff --git "a/test- \\\\\\321\\217\\321\\217" "b/test- \\\\\\321\\217\\321\\217" + ... new file mode 100644 + ... index 000000000000..7f53c853ca78 + ... --- /dev/null + ... +++ "b/test- \\\\\\321\\217\\321\\217" + ... @@ -0,0 +1,2 @@ + ... +\xCD\xCE\xCD-\xD3\xD2\xD4-8 \xF2\xE5\xF1\xF2 + ... 
+test''' + >>> fp = io.BytesIO(rawpatch) + >>> patch = parsepatch(fp) + >>> files = patch.headers[0].files() + >>> files[0] + >>> files[1].decode('UTF-8') + 'test- \\яя' """ class Parser: diff --git a/git_crecord/util.py b/git_crecord/util.py index eb84e9a..6c1065f 100644 --- a/git_crecord/util.py +++ b/git_crecord/util.py @@ -201,3 +201,40 @@ _notset = object() def safehasattr(thing, attr): return getattr(thing, attr, _notset) is not _notset + + +def unescape_filename(filename: bytes) -> bytes: + r"""Unescape a filename after Git mangled it for "git diff --git" line. + + >>> unescape_filename(b'a/\\321\\216\\321\\217') + b'a/\xd1\x8e\xd1\x8f' + >>> unescape_filename(b'a/\\\\') + b'a/\\' + >>> unescape_filename(b'a/file\\55name') + b'a/file-name' + """ + unescaped_unicode = filename.decode('unicode_escape') + return bytes(ord(x) for x in unescaped_unicode) + + +def unwrap_filename(filename: bytes) -> bytes: + r"""Unwrap a filename mangled by Git + + If the filename is in double quotes, remove them and unescape enclosed characters. + Otherwise, return the input as is. + + >>> def apply(f, s: str) -> str: + ... return f(s.encode("UTF-8")).decode("UTF-8") + >>> apply(unwrap_filename, 'a/filename') + 'a/filename' + >>> apply(unwrap_filename, 'a/имя-файла') + 'a/имя-файла' + >>> apply(unwrap_filename, '"a/file\\55name"') + 'a/file-name' + >>> apply(unwrap_filename, '"a/им\\321\\217\55\\\\name"') + 'a/имя-\\name' + """ + if filename.startswith(b'"') and filename.endswith(b'"'): + return unescape_filename(filename[1:-1]) + else: + return filename |