diff options
Diffstat (limited to 'scripts')
-rwxr-xr-x | scripts/dtrx | 170 |
1 files changed, 129 insertions, 41 deletions
diff --git a/scripts/dtrx b/scripts/dtrx index 7a98ba5..2fc99e3 100755 --- a/scripts/dtrx +++ b/scripts/dtrx @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- # # dtrx -- Intelligently extract various archive types. -# Copyright © 2006-2009 Brett Smith <brettcsmith@brettcsmith.org> +# Copyright © 2006-2011 Brett Smith <brettcsmith@brettcsmith.org> # Copyright © 2008 Peter Kelemen <Peter.Kelemen@gmail.com> # # This program is free software; you can redistribute it and/or modify it @@ -38,15 +38,16 @@ import tempfile import termios import textwrap import traceback +import urlparse try: set except NameError: from sets import Set as set -VERSION = "6.6" +VERSION = "7.1" VERSION_BANNER = """dtrx version %s -Copyright © 2006-2009 Brett Smith <brettcsmith@brettcsmith.org> +Copyright © 2006-2011 Brett Smith <brettcsmith@brettcsmith.org> Copyright © 2008 Peter Kelemen <Peter.Kelemen@gmail.com> This program is free software; you can redistribute it and/or modify it @@ -81,6 +82,7 @@ RECURSE_LIST = 5 mimetypes.encodings_map.setdefault('.bz2', 'bzip2') mimetypes.encodings_map.setdefault('.lzma', 'lzma') mimetypes.encodings_map.setdefault('.xz', 'xz') +mimetypes.encodings_map.setdefault('.lz', 'lzip') mimetypes.types_map.setdefault('.gem', 'application/x-ruby-gem') logger = logging.getLogger('dtrx-log') @@ -138,8 +140,8 @@ class ExtractorUnusable(Exception): EXTRACTION_ERRORS = (ExtractorError, ExtractorUnusable, OSError, IOError) class BaseExtractor(object): - decoders = {'bzip2': 'bzcat', 'gzip': 'zcat', 'compress': 'zcat', - 'lzma': 'lzcat', 'xz': 'xzcat'} + decoders = {'bzip2': ['bzcat'], 'gzip': ['zcat'], 'compress': ['zcat'], + 'lzma': ['lzcat'], 'xz': ['xzcat'], 'lzip': ['lzip', '-cd']} name_checker = DirectoryChecker def __init__(self, filename, encoding): @@ -161,18 +163,12 @@ class BaseExtractor(object): raise ExtractorError("could not open %s: %s" % (filename, error.strerror)) if encoding: - self.pipe([self.decoders[encoding]], "decoding") + self.pipe(self.decoders[encoding], "decoding") self.prepare() def pipe(self, command, description="extraction"): self.pipes.append((command, description)) - def first_bad_exit_code(self): - for index, code in enumerate(self.exit_codes): - if code != 0: - return index - return None - def add_process(self, processes, command, stdin, stdout): try: processes.append(subprocess.Popen(command, stdin=stdin, @@ -243,7 +239,16 @@ class BaseExtractor(object): def basename(self): pieces = os.path.basename(self.filename).split('.') + orig_len = len(pieces) extension = '.' + pieces[-1] + # This is maybe a little more clever than it ought to be. + # We're trying to be conservative about what remove, but also DTRT + # in cases like .tar.gz, and also do something reasonable if we + # encounter some completely off-the-wall extension. So that means: + # 1. First remove any compression extension. + # 2. Then remove any commonly known extension that remains. + # 3. If neither of those did anything, remove anything that looks + # like it's almost certainly an extension (less than 5 chars). if mimetypes.encodings_map.has_key(extension): pieces.pop() extension = '.' + pieces[-1] @@ -251,6 +256,9 @@ class BaseExtractor(object): mimetypes.common_types.has_key(extension) or mimetypes.suffix_map.has_key(extension)): pieces.pop() + if ((orig_len == len(pieces)) and + (orig_len > 1) and (len(pieces[-1]) < 5)): + pieces.pop() return '.'.join(pieces) def get_stderr(self): @@ -259,13 +267,25 @@ class BaseExtractor(object): self.stderr.close() return errors - def check_success(self, got_output): - error_index = self.first_bad_exit_code() - if (not got_output) and (error_index is not None): + def is_fatal_error(self, status): + return False + + def first_bad_exit_code(self): + for index, code in enumerate(self.exit_codes): + if code > 0: + return index, code + return None, None + + def check_success(self, got_files): + error_index, error_code = self.first_bad_exit_code() + logger.debug("success results: %s %s %s" % (got_files, error_index, + self.exit_codes)) + if (self.is_fatal_error(error_code) or + ((not got_files) and (error_code is not None))): command = ' '.join(self.pipes[error_index][0]) raise ExtractorError("%s error: '%s' returned status code %s" % (self.pipes[error_index][1], command, - self.exit_codes[error_index])) + error_code)) def extract_archive(self): self.pipe(self.extract_pipe) @@ -340,6 +360,7 @@ class CompressionExtractor(BaseExtractor): self.content_type = ONE_ENTRY_KNOWN self.content_name = self.basename() self.contents = None + self.file_count = 1 self.included_root = './' try: output_fd, self.target = tempfile.mkstemp(prefix='.dtrx-', dir='.') @@ -352,6 +373,7 @@ class CompressionExtractor(BaseExtractor): except EXTRACTION_ERRORS: os.unlink(self.target) raise + class TarExtractor(BaseExtractor): file_type = 'tar file' @@ -410,7 +432,7 @@ class DebExtractor(TarExtractor): raise ExtractorError("data.tar file has unrecognized encoding") self.pipe(['ar', 'p', self.filename, data_filename], "extracting data.tar from .deb") - self.pipe([self.decoders[encoding]], "decoding data.tar") + self.pipe(self.decoders[encoding], "decoding data.tar") def basename(self): pieces = os.path.basename(self.filename).split('_') @@ -486,6 +508,39 @@ class ZipExtractor(NoPipeExtractor): extract_command = ['unzip', '-q'] list_command = ['zipinfo', '-1'] + def is_fatal_error(self, status): + return status > 1 + + +class LZHExtractor(ZipExtractor): + file_type = 'LZH file' + extract_command = ['lha', 'xq'] + list_command = ['lha', 'l'] + + def border_line_file_index(self, line): + last_space_index = None + for index, char in enumerate(line): + if char == ' ': + last_space_index = index + elif char != '-': + return None + if last_space_index is None: + return None + return last_space_index + 1 + + def get_filenames(self): + filenames = NoPipeExtractor.get_filenames(self) + for line in filenames: + fn_index = self.border_line_file_index(line) + if fn_index is not None: + break + for line in filenames: + if self.border_line_file_index(line): + break + else: + yield line[fn_index:] + self.archive.close() + class SevenExtractor(NoPipeExtractor): file_type = '7z file' @@ -673,7 +728,9 @@ class EmptyHandler(object): return contents == EMPTY can_handle = staticmethod(can_handle) - def __init__(self, extractor, options): pass + def __init__(self, extractor, options): + os.rmdir(extractor.target) + def handle(self): pass @@ -799,7 +856,8 @@ class RecursionPolicy(BasePolicy): def prep(self, current_filename, target, extractor): archive_count = len(extractor.included_archives) - if (self.permanent_policy is not None) or (archive_count == 0): + if ((self.permanent_policy is not None) or + ((archive_count * 10) <= extractor.file_count)): self.current_policy = self.permanent_policy or RECURSE_NOT_NOW return question = self.wrap( @@ -825,48 +883,56 @@ class RecursionPolicy(BasePolicy): class ExtractorBuilder(object): - extractor_map = {'tar': {'extractor': TarExtractor, + extractor_map = {'tar': {'extractors': (TarExtractor,), 'mimetypes': ('x-tar',), 'extensions': ('tar',), 'magic': ('POSIX tar archive',)}, - 'zip': {'extractor': ZipExtractor, + 'zip': {'extractors': (ZipExtractor, SevenExtractor), 'mimetypes': ('zip',), 'extensions': ('zip',), 'magic': ('(Zip|ZIP self-extracting) archive',)}, - 'rpm': {'extractor': RPMExtractor, + 'lzh': {'extractors': (LZHExtractor,), + 'mimetypes': ('x-lzh', 'x-lzh-compressed'), + 'extensions': ('lzh', 'lha'), + 'magic': ('LHa [\d\.\?]+ archive',)}, + 'rpm': {'extractors': (RPMExtractor,), 'mimetypes': ('x-redhat-package-manager', 'x-rpm'), 'extensions': ('rpm',), 'magic': ('RPM',)}, - 'deb': {'extractor': DebExtractor, - 'metadata': DebMetadataExtractor, + 'deb': {'extractors': (DebExtractor,), + 'metadata': (DebMetadataExtractor,), 'mimetypes': ('x-debian-package',), 'extensions': ('deb',), 'magic': ('Debian binary package',)}, - 'cpio': {'extractor': CpioExtractor, + 'cpio': {'extractors': (CpioExtractor,), 'mimetypes': ('x-cpio',), 'extensions': ('cpio',), 'magic': ('cpio archive',)}, - 'gem': {'extractor': GemExtractor, - 'metadata': GemMetadataExtractor, + 'gem': {'extractors': (GemExtractor,), + 'metadata': (GemMetadataExtractor,), 'mimetypes': ('x-ruby-gem',), 'extensions': ('gem',)}, - '7z': {'extractor': SevenExtractor, + '7z': {'extractors': (SevenExtractor,), 'mimetypes': ('x-7z-compressed',), 'extensions': ('7z',), 'magic': ('7-zip archive',)}, - 'cab': {'extractor': CABExtractor, + 'cab': {'extractors': (CABExtractor,), 'mimetypes': ('x-cab',), 'extensions': ('cab',), 'magic': ('Microsoft Cabinet Archive',)}, - 'rar': {'extractor': RarExtractor, + 'rar': {'extractors': (RarExtractor,), 'mimetypes': ('rar',), 'extensions': ('rar',), 'magic': ('RAR archive',)}, - 'shield': {'extractor': ShieldExtractor, + 'shield': {'extractors': (ShieldExtractor,), 'mimetypes': ('x-cab',), 'extensions': ('cab', 'hdr'), 'magic': ('InstallShield CAB',)}, - 'compress': {'extractor': CompressionExtractor} + 'msi': {'extractors': (SevenExtractor,), + 'mimetypes': ('x-msi', 'x-ole-storage'), + 'extensions': ('msi',), + 'magic': ('Application: Windows Installer',)}, + 'compress': {'extractors': (CompressionExtractor,)} } mimetype_map = {} @@ -886,6 +952,7 @@ class ExtractorBuilder(object): ('tar', 'gzip', 'tar.gz', 'tgz'), ('tar', 'lzma', 'tar.lzma', 'tlz'), ('tar', 'xz', 'tar.xz'), + ('tar', 'lz', 'tar.lz'), ('tar', 'compress', 'tar.Z', 'taz'), ('compress', 'gzip', 'Z', 'gz'), ('compress', 'bzip2', 'bz2'), @@ -898,6 +965,7 @@ class ExtractorBuilder(object): for mapping in (('bzip2', 'bzip2 compressed'), ('gzip', 'gzip compressed'), ('lzma', 'LZMA compressed'), + ('lzip', 'lzip compressed'), ('xz', 'xz compressed')): for pattern in mapping[1:]: magic_encoding_map[re.compile(pattern)] = mapping[0] @@ -907,12 +975,13 @@ class ExtractorBuilder(object): self.options = options def build_extractor(self, archive_type, encoding): - extractors = self.extractor_map[archive_type] - if self.options.metadata and extractors.has_key('metadata'): - extractor = extractors['metadata'] + type_info = self.extractor_map[archive_type] + if self.options.metadata and type_info.has_key('metadata'): + extractors = type_info['metadata'] else: - extractor = extractors['extractor'] - return extractor(self.filename, encoding) + extractors = type_info['extractors'] + for extractor in extractors: + yield extractor(self.filename, encoding) def get_extractor(self): tried_types = set() @@ -932,7 +1001,8 @@ class ExtractorBuilder(object): tried_types.add(ext_args) logger.debug("trying %s extractor from %s" % (ext_args, func_name)) - yield self.build_extractor(*ext_args) + for extractor in self.build_extractor(*ext_args): + yield extractor def try_by_mimetype(cls, filename): mimetype, encoding = mimetypes.guess_type(filename) @@ -1228,6 +1298,21 @@ class ExtractorApplication(object): self.show_stderr(logger.error, stderr) return True + def download(self, filename): + url = filename.lower() + for protocol in 'http', 'https', 'ftp': + if url.startswith(protocol + '://'): + break + else: + return filename, None + # FIXME: This can fail if there's already a file in the directory + # that matches the basename of the URL. + status = subprocess.call(['wget', '-c', filename], + stdin=subprocess.PIPE) + if status != 0: + return None, "wget returned status code %s" % (status,) + return os.path.basename(urlparse.urlparse(filename)[2]), None + def run(self): if self.options.show_list: action = ListAction @@ -1238,9 +1323,12 @@ class ExtractorApplication(object): self.current_directory, self.filenames = self.archives.popitem() os.chdir(self.current_directory) for filename in self.filenames: - builder = ExtractorBuilder(filename, self.options) - error = (self.check_file(filename) or - self.try_extractors(filename, builder.get_extractor())) + filename, error = self.download(filename) + if not error: + builder = ExtractorBuilder(filename, self.options) + error = (self.check_file(filename) or + self.try_extractors(filename, + builder.get_extractor())) if error: if error != True: logger.error("%s: %s" % (filename, error)) |