1 files changed, 576 insertions, 0 deletions
diff --git a/pyaxmlparser/axmlparser.py b/pyaxmlparser/axmlparser.py
new file mode 100644
index 0000000..53678fa
--- /dev/null
+++ b/pyaxmlparser/axmlparser.py
@@ -0,0 +1,576 @@
+# This file is part of Androguard.
+#
+# Copyright (C) 2012, Anthony Desnos <desnos at t0t0.fr>
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from struct import unpack
+
+import pyaxmlparser.constants as const
+from pyaxmlparser import bytecode
+from pyaxmlparser.stringblock import StringBlock
+from pyaxmlparser.resources import public
+from .arscutil import ARSCHeader
+
+log = logging.getLogger("pyaxmlparser.axmlparser")
+
+
+class AXMLParser(object):
+    """
+    AXMLParser reads through all chunks in the AXML file
+    and implements a state machine to return information about
+    the current chunk, which can then be read by :class:`~AXMLPrinter`.
+
+    An AXML file is a file which contains multiple chunks of data, defined
+    by the `ResChunk_header`.
+    There is no real file magic but as the size of the first header is fixed
+    and the `type` of the `ResChunk_header` is set to `RES_XML_TYPE`, a file
+    will usually start with `0x03000800`.
+    But there are several examples where the `type` is set to something
+    else, probably in order to fool parsers.
+
+    Typically the AXMLParser is used in a loop which terminates if `m_event` is set to `END_DOCUMENT`.
+    You can use the `next()` function to get the next chunk.
+    Note that not all chunk types are yielded from the iterator! Some chunks are processed in
+    the AXMLParser only.
+    The parser will set `is_valid()` to False if it parses something not valid.
+    Messages what is wrong are logged.
+
+    See http://androidxref.com/9.0.0_r3/xref/frameworks/base/libs/androidfw/include/androidfw/ResourceTypes.h#563
+    """
+    def __init__(self, raw_buff):
+        self._reset()
+
+        self._valid = True
+        self.axml_tampered = False
+        self.buff = bytecode.BuffHandle(raw_buff)
+
+        # Minimum is a single ARSCHeader, which would be a strange edge case...
+        if self.buff.size() < 8:
+            log.error("Filesize is too small to be a valid AXML file! Filesize: {}".format(self.buff.size()))
+            self._valid = False
+            return
+
+        # This would be even stranger, if an AXML file is larger than 4GB...
+        # But this is not possible as the maximum chunk size is a unsigned 4 byte int.
+        if self.buff.size() > 0xFFFFFFFF:
+            log.error("Filesize is too large to be a valid AXML file! Filesize: {}".format(self.buff.size()))
+            self._valid = False
+            return
+
+        try:
+            axml_header = ARSCHeader(self.buff)
+        except AssertionError as e:
+            log.error("Error parsing first resource header: %s", e)
+            self._valid = False
+            return
+
+        self.filesize = axml_header.size
+
+        if axml_header.header_size == 28024:
+            # Can be a common error: the file is not an AXML but a plain XML
+            # The file will then usually start with '<?xm' / '3C 3F 78 6D'
+            log.warning("Header size is 28024! Are you trying to parse a plain XML file?")
+
+        if axml_header.header_size != 8:
+            log.error(
+                "This does not look like an AXML file. "
+                "header size does not equal 8! header size = {}".format(
+                    axml_header.header_size
+                )
+            )
+            self._valid = False
+            return
+
+        if self.filesize > self.buff.size():
+            log.error(
+                "This does not look like an AXML file. "
+                "Declared filesize does not match real size: {} vs {}".format(
+                    self.filesize, self.buff.size()
+                )
+            )
+            self._valid = False
+            return
+
+        if self.filesize < self.buff.size():
+            # The file can still be parsed up to the point where the chunk should end.
+            self.axml_tampered = True
+            log.warning(
+                "Declared filesize ({}) is smaller than total file size ({}). "
+                "Was something appended to the file? Trying to parse it anyways.".format(
+                    self.filesize, self.buff.size()
+                )
+            )
+
+        # Not that severe of an error, we have plenty files where this is not
+        # set correctly
+        if axml_header.type != const.RES_XML_TYPE:
+            self.axml_tampered = True
+            log.warning(
+                "AXML file has an unusual resource type! "
+                "Malware likes to to such stuff to anti androguard! "
+                "But we try to parse it anyways. "
+                "Resource Type: 0x{:04x}".format(axml_header.type)
+            )
+
+        # Now we parse the STRING POOL
+        try:
+            header = ARSCHeader(self.buff)
+        except AssertionError as e:
+            log.error("Error parsing resource header of string pool: %s", e)
+            self._valid = False
+            return
+
+        if header.header_size != 0x1C:
+            log.error(
+                "This does not look like an AXML file. String chunk header "
+                "size does not equal 28! header size = {}".format(
+                    header.header_size
+                )
+            )
+            self._valid = False
+            return
+
+        if header.type != const.RES_STRING_POOL_TYPE:
+            log.error(
+                "Expected String Pool header, got resource type 0x{:04x} "
+                "instead".format(header.type)
+            )
+            self._valid = False
+            return
+
+        self.sb = StringBlock(self.buff, header)
+
+        # Stores resource ID mappings, if any
+        self.m_resourceIDs = []
+
+        # Store a list of prefix/uri mappings encountered
+        self.namespaces = []
+
+    def is_valid(self):
+        """
+        Get the state of the AXMLPrinter.
+        if an error happend somewhere in the process of parsing the file,
+        this flag is set to False.
+        """
+        return self._valid
+
+    def _reset(self):
+        self.m_event = -1
+        self.m_lineNumber = -1
+        self.m_name = -1
+        self.m_namespaceUri = -1
+        self.m_attributes = []
+        self.m_idAttribute = -1
+        self.m_classAttribute = -1
+        self.m_styleAttribute = -1
+
+    def __next__(self):
+        self._do_next()
+        return self.m_event
+
+    next = __next__  # For Python 2 compatibility
+
+    def _do_next(self):
+        if self.m_event == const.END_DOCUMENT:
+            return
+
+        self._reset()
+        while self._valid:
+            # Stop at the declared filesize or at the end of the file
+            if self.buff.end() or self.buff.get_idx() == self.filesize:
+                self.m_event = const.END_DOCUMENT
+                break
+
+            # Again, we read an ARSCHeader
+            try:
+                h = ARSCHeader(self.buff)
+            except AssertionError as e:
+                log.error("Error parsing resource header: %s", e)
+                self._valid = False
+                return
+
+            # Special chunk: Resource Map. This chunk might be contained inside
+            # the file, after the string pool.
+            if h.type == const.RES_XML_RESOURCE_MAP_TYPE:
+                log.debug("AXML contains a RESOURCE MAP")
+                # Check size: < 8 bytes mean that the chunk is not complete
+                # Should be aligned to 4 bytes.
+                if h.size < 8 or (h.size % 4) != 0:
+                    log.error("Invalid chunk size in chunk XML_RESOURCE_MAP")
+                    self._valid = False
+                    return
+
+                for i in range((h.size - h.header_size) // 4):
+                    self.m_resourceIDs.append(unpack('<L', self.buff.read(4))[0])
+
+                continue
+
+            # Parse now the XML chunks.
+            # unknown chunk types might cause problems, but we can skip them!
+            if h.type < const.RES_XML_FIRST_CHUNK_TYPE or h.type > const.RES_XML_LAST_CHUNK_TYPE:
+                # h.size is the size of the whole chunk including the header.
+                # We read already 8 bytes of the header, thus we need to
+                # subtract them.
+                log.error("Not a XML resource chunk type: 0x{:04x}. Skipping {} bytes".format(h.type, h.size))
+                self.buff.set_idx(h.end)
+                continue
+
+            # Check that we read a correct header
+            if h.header_size != 0x10:
+                log.error(
+                    "XML Resource Type Chunk header size does not match 16! "
+                    "At chunk type 0x{:04x}, declared header size={}, "
+                    "chunk size={}".format(h.type, h.header_size, h.size)
+                )
+                self._valid = False
+                return
+
+            # Line Number of the source file, only used as meta information
+            self.m_lineNumber, = unpack('<L', self.buff.read(4))
+
+            # Comment_Index (usually 0xFFFFFFFF)
+            self.m_comment_index, = unpack('<L', self.buff.read(4))
+
+            if self.m_comment_index != 0xFFFFFFFF and h.type in [
+                    const.RES_XML_START_NAMESPACE_TYPE,
+                    const.RES_XML_END_NAMESPACE_TYPE]:
+                log.warning("Unhandled Comment at namespace chunk: '{}'".format(
+                    self.sb[self.m_comment_index])
+                )
+
+            if h.type == const.RES_XML_START_NAMESPACE_TYPE:
+                prefix, = unpack('<L', self.buff.read(4))
+                uri, = unpack('<L', self.buff.read(4))
+
+                s_prefix = self.sb[prefix]
+                s_uri = self.sb[uri]
+
+                log.debug(
+                    "Start of Namespace mapping: prefix "
+                    "{}: '{}' --> uri {}: '{}'".format(
+                        prefix, s_prefix, uri, s_uri
+                    )
+                )
+
+                if s_uri == '':
+                    log.warning("Namespace prefix '{}' resolves to empty URI. "
+                                "This might be a packer.".format(s_prefix))
+
+                if (prefix, uri) in self.namespaces:
+                    log.info(
+                        "Namespace mapping ({}, {}) already seen! "
+                        "This is usually not a problem but could indicate "
+                        "packers or broken AXML compilers.".format(prefix, uri))
+                self.namespaces.append((prefix, uri))
+
+                # We can continue with the next chunk, as we store the namespace
+                # mappings for each tag
+                continue
+
+            if h.type == const.RES_XML_END_NAMESPACE_TYPE:
+                # END_PREFIX contains again prefix and uri field
+                prefix, = unpack('<L', self.buff.read(4))
+                uri, = unpack('<L', self.buff.read(4))
+
+                # We remove the last namespace mapping matching
+                if (prefix, uri) in self.namespaces:
+                    self.namespaces.remove((prefix, uri))
+                else:
+                    log.warning(
+                        "Reached a NAMESPACE_END without having the namespace stored before? "
+                        "Prefix ID: {}, URI ID: {}".format(prefix, uri)
+                    )
+
+                # We can continue with the next chunk, as we store the namespace
+                # mappings for each tag
+                continue
+
+            # START_TAG is the start of a new tag.
+            if h.type == const.RES_XML_START_ELEMENT_TYPE:
+                # The TAG consists of some fields:
+                # * (chunk_size, line_number, comment_index - we read before)
+                # * namespace_uri
+                # * name
+                # * flags
+                # * attribute_count
+                # * class_attribute
+                # After that, there are two lists of attributes, 20 bytes each
+
+                # Namespace URI (String ID)
+                self.m_namespaceUri, = unpack('<L', self.buff.read(4))
+                # Name of the Tag (String ID)
+                self.m_name, = unpack('<L', self.buff.read(4))
+                # FIXME: Flags
+                _ = self.buff.read(4)  # noqa
+                # Attribute Count
+                attributeCount, = unpack('<L', self.buff.read(4))
+                # Class Attribute
+                self.m_classAttribute, = unpack('<L', self.buff.read(4))
+
+                self.m_idAttribute = (attributeCount >> 16) - 1
+                self.m_attribute_count = attributeCount & 0xFFFF
+                self.m_styleAttribute = (self.m_classAttribute >> 16) - 1
+                self.m_classAttribute = (self.m_classAttribute & 0xFFFF) - 1
+
+                # Now, we parse the attributes.
+                # Each attribute has 5 fields of 4 byte
+                for i in range(0, self.m_attribute_count * const.ATTRIBUTE_LENGHT):
+                    # Each field is linearly parsed into the array
+                    # Each Attribute contains:
+                    # * Namespace URI (String ID)
+                    # * Name (String ID)
+                    # * Value
+                    # * Type
+                    # * Data
+                    self.m_attributes.append(unpack('<L', self.buff.read(4))[0])
+
+                # Then there are class_attributes
+                for i in range(const.ATTRIBUTE_IX_VALUE_TYPE, len(self.m_attributes), const.ATTRIBUTE_LENGHT):
+                    self.m_attributes[i] = self.m_attributes[i] >> 24
+
+                self.m_event = const.START_TAG
+                break
+
+            if h.type == const.RES_XML_END_ELEMENT_TYPE:
+                self.m_namespaceUri, = unpack('<L', self.buff.read(4))
+                self.m_name, = unpack('<L', self.buff.read(4))
+
+                self.m_event = const.END_TAG
+                break
+
+            if h.type == const.RES_XML_CDATA_TYPE:
+                # The CDATA field is like an attribute.
+                # It contains an index into the String pool
+                # as well as a typed value.
+                # usually, this typed value is set to UNDEFINED
+
+                # ResStringPool_ref data --> uint32_t index
+                self.m_name, = unpack('<L', self.buff.read(4))
+
+                # Res_value typedData:
+                # uint16_t size
+                # uint8_t res0 -> always zero
+                # uint8_t dataType
+                # uint32_t data
+                # For now, we ingore these values
+                size, res0, dataType, data = unpack("<HBBL", self.buff.read(8))
+
+                log.debug(
+                    "found a CDATA Chunk: "
+                    "index={: 6d}, size={: 4d}, res0={: 4d}, "
+                    "dataType={: 4d}, data={: 4d}".format(
+                        self.m_name, size, res0, dataType, data
+                    )
+                )
+
+                self.m_event = const.TEXT
+                break
+
+            # Still here? Looks like we read an unknown XML header, try to skip it...
+            log.warning("Unknown XML Chunk: 0x{:04x}, skipping {} bytes.".format(h.type, h.size))
+            self.buff.set_idx(h.end)
+
+    @property
+    def name(self):
+        """
+        Return the String assosciated with the tag name
+        """
+        if self.m_name == -1 or (self.m_event != const.START_TAG and self.m_event != const.END_TAG):
+            return u''
+
+        return self.sb[self.m_name]
+
+    @property
+    def comment(self):
+        """
+        Return the comment at the current position or None if no comment is given
+
+        This works only for Tags, as the comments of Namespaces are silently dropped.
+        Currently, there is no way of retrieving comments of namespaces.
+        """
+        if self.m_comment_index == 0xFFFFFFFF:
+            return None
+
+        return self.sb[self.m_comment_index]
+
+    @property
+    def namespace(self):
+        """
+        Return the Namespace URI (if any) as a String for the current tag
+        """
+        if self.m_name == -1 or (self.m_event != const.START_TAG and self.m_event != const.END_TAG):
+            return u''
+
+        # No Namespace
+        if self.m_namespaceUri == 0xFFFFFFFF:
+            return u''
+
+        return self.sb[self.m_namespaceUri]
+
+    @property
+    def nsmap(self):
+        """
+        Returns the current namespace mapping as a dictionary
+
+        there are several problems with the map and we try to guess a few
+        things here:
+
+        1) a URI can be mapped by many prefixes, so it is to decide which one to take
+        2) a prefix might map to an empty string (some packers)
+        3) uri+prefix mappings might be included several times
+        4) prefix might be empty
+        """
+
+        NSMAP = dict()
+        # solve 3) by using a set
+        for k, v in set(self.namespaces):
+            s_prefix = self.sb[k]
+            s_uri = self.sb[v]
+            # Solve 2) & 4) by not including
+            if s_uri != "" and s_prefix != "":
+                # solve 1) by using the last one in the list
+                NSMAP[s_prefix] = s_uri
+
+        return NSMAP
+
+    @property
+    def text(self):
+        """
+        Return the String assosicated with the current text
+        """
+        if self.m_name == -1 or self.m_event != const.TEXT:
+            return u''
+
+        return self.sb[self.m_name]
+
+    def getName(self):
+        """
+        Legacy only!
+        use :py:attr:`~androguard.core.bytecodes.AXMLParser.name` instead
+        """
+        return self.name
+
+    def getText(self):
+        """
+        Legacy only!
+        use :py:attr:`~androguard.core.bytecodes.AXMLParser.text` instead
+        """
+        return self.text
+
+    def getPrefix(self):
+        """
+        Legacy only!
+        use :py:attr:`~androguard.core.bytecodes.AXMLParser.namespace` instead
+        """
+        return self.namespace
+
+    def _get_attribute_offset(self, index):
+        """
+        Return the start inside the m_attributes array for a given attribute
+        """
+        if self.m_event != const.START_TAG:
+            log.warning("Current event is not START_TAG.")
+
+        offset = index * const.ATTRIBUTE_LENGHT
+        if offset >= len(self.m_attributes):
+            log.warning("Invalid attribute index")
+
+        return offset
+
+    def getAttributeCount(self):
+        """
+        Return the number of Attributes for a Tag
+        or -1 if not in a tag
+        """
+        if self.m_event != const.START_TAG:
+            return -1
+
+        return self.m_attribute_count
+
+    def getAttributeUri(self, index):
+        """
+        Returns the numeric ID for the namespace URI of an attribute
+        """
+        offset = self._get_attribute_offset(index)
+        uri = self.m_attributes[offset + const.ATTRIBUTE_IX_NAMESPACE_URI]
+
+        return uri
+
+    def getAttributeNamespace(self, index):
+        """
+        Return the Namespace URI (if any) for the attribute
+        """
+        uri = self.getAttributeUri(index)
+
+        # No Namespace
+        if uri == 0xFFFFFFFF:
+            return u''
+
+        return self.sb[uri]
+
+    def getAttributeName(self, index):
+        """
+        Returns the String which represents the attribute name
+        """
+        offset = self._get_attribute_offset(index)
+        name = self.m_attributes[offset + const.ATTRIBUTE_IX_NAME]
+
+        res = self.sb[name]
+        # If the result is a (null) string, we need to look it up.
+        if not res:
+            attr = self.m_resourceIDs[name]
+            if attr in public.SYSTEM_RESOURCES['attributes']['inverse']:
+                res = 'android:' + public.SYSTEM_RESOURCES['attributes']['inverse'][attr]
+            else:
+                # Attach the HEX Number, so for multiple missing attributes we do not run
+                # into problems.
+                res = 'android:UNKNOWN_SYSTEM_ATTRIBUTE_{:08x}'.format(attr)
+
+        return res
+
+    def getAttributeValueType(self, index):
+        """
+        Return the type of the attribute at the given index
+
+        :param index: index of the attribute
+        """
+        offset = self._get_attribute_offset(index)
+        return self.m_attributes[offset + const.ATTRIBUTE_IX_VALUE_TYPE]
+
+    def getAttributeValueData(self, index):
+        """
+        Return the data of the attribute at the given index
+
+        :param index: index of the attribute
+        """
+        offset = self._get_attribute_offset(index)
+        return self.m_attributes[offset + const.ATTRIBUTE_IX_VALUE_DATA]
+
+    def getAttributeValue(self, index):
+        """
+        This function is only used to look up strings
+        All other work is done by
+        :func:`~androguard.core.bytecodes.axml.format_value`
+        # FIXME should unite those functions
+        :param index: index of the attribute
+        :return:
+        """
+        offset = self._get_attribute_offset(index)
+        valueType = self.m_attributes[offset + const.ATTRIBUTE_IX_VALUE_TYPE]
+        if valueType == const.TYPE_STRING:
+            valueString = self.m_attributes[offset + const.ATTRIBUTE_IX_VALUE_STRING]
+            return self.sb[valueString]
+        return u''