summaryrefslogtreecommitdiff
path: root/pyaxmlparser/axmlparser.py
diff options
context:
space:
mode:
Diffstat (limited to 'pyaxmlparser/axmlparser.py')
-rw-r--r--pyaxmlparser/axmlparser.py576
1 files changed, 576 insertions, 0 deletions
diff --git a/pyaxmlparser/axmlparser.py b/pyaxmlparser/axmlparser.py
new file mode 100644
index 0000000..53678fa
--- /dev/null
+++ b/pyaxmlparser/axmlparser.py
@@ -0,0 +1,576 @@
+# This file is part of Androguard.
+#
+# Copyright (C) 2012, Anthony Desnos <desnos at t0t0.fr>
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from struct import unpack
+
+import pyaxmlparser.constants as const
+from pyaxmlparser import bytecode
+from pyaxmlparser.stringblock import StringBlock
+from pyaxmlparser.resources import public
+from .arscutil import ARSCHeader
+
+log = logging.getLogger("pyaxmlparser.axmlparser")
+
+
+class AXMLParser(object):
+ """
+ AXMLParser reads through all chunks in the AXML file
+ and implements a state machine to return information about
+ the current chunk, which can then be read by :class:`~AXMLPrinter`.
+
+ An AXML file is a file which contains multiple chunks of data, defined
+ by the `ResChunk_header`.
+ There is no real file magic but as the size of the first header is fixed
+ and the `type` of the `ResChunk_header` is set to `RES_XML_TYPE`, a file
+ will usually start with `0x03000800`.
+ But there are several examples where the `type` is set to something
+ else, probably in order to fool parsers.
+
+ Typically the AXMLParser is used in a loop which terminates if `m_event` is set to `END_DOCUMENT`.
+ You can use the `next()` function to get the next chunk.
+ Note that not all chunk types are yielded from the iterator! Some chunks are processed in
+ the AXMLParser only.
+ `is_valid()` returns False as soon as the parser encounters something invalid.
+ A message describing what went wrong is logged in that case.
+
+ See http://androidxref.com/9.0.0_r3/xref/frameworks/base/libs/androidfw/include/androidfw/ResourceTypes.h#563
+ """
+    def __init__(self, raw_buff):
+        """
+        Validate the leading AXML headers of `raw_buff` and load the string pool.
+
+        On any fatal problem an error is logged, the parser is flagged as
+        invalid (see :meth:`is_valid`) and the constructor returns early.
+        Non-fatal oddities only set `axml_tampered` and log a warning.
+
+        :param raw_buff: raw content of the AXML file, wrapped in a
+            `bytecode.BuffHandle`
+        """
+        self._reset()
+
+        self._valid = True
+        self.axml_tampered = False
+
+        # Define every attribute that other methods read *before* any of the
+        # early returns below, so that an invalid parser can still be queried
+        # (e.g. `nsmap`) without raising AttributeError.
+        self.filesize = 0
+        self.sb = None
+        # Stores resource ID mappings, if any
+        self.m_resourceIDs = []
+        # Store a list of prefix/uri mappings encountered
+        self.namespaces = []
+
+        self.buff = bytecode.BuffHandle(raw_buff)
+
+        # Minimum is a single ARSCHeader, which would be a strange edge case...
+        if self.buff.size() < 8:
+            log.error("Filesize is too small to be a valid AXML file! Filesize: {}".format(self.buff.size()))
+            self._valid = False
+            return
+
+        # This would be even stranger, if an AXML file is larger than 4GB...
+        # But this is not possible as the maximum chunk size is a unsigned 4 byte int.
+        if self.buff.size() > 0xFFFFFFFF:
+            log.error("Filesize is too large to be a valid AXML file! Filesize: {}".format(self.buff.size()))
+            self._valid = False
+            return
+
+        try:
+            axml_header = ARSCHeader(self.buff)
+        except AssertionError as e:
+            log.error("Error parsing first resource header: %s", e)
+            self._valid = False
+            return
+
+        self.filesize = axml_header.size
+
+        if axml_header.header_size == 28024:
+            # Can be a common error: the file is not an AXML but a plain XML
+            # The file will then usually start with '<?xm' / '3C 3F 78 6D'
+            log.warning("Header size is 28024! Are you trying to parse a plain XML file?")
+
+        if axml_header.header_size != 8:
+            log.error(
+                "This does not look like an AXML file. "
+                "header size does not equal 8! header size = {}".format(
+                    axml_header.header_size
+                )
+            )
+            self._valid = False
+            return
+
+        if self.filesize > self.buff.size():
+            log.error(
+                "This does not look like an AXML file. "
+                "Declared filesize does not match real size: {} vs {}".format(
+                    self.filesize, self.buff.size()
+                )
+            )
+            self._valid = False
+            return
+
+        if self.filesize < self.buff.size():
+            # The file can still be parsed up to the point where the chunk should end.
+            self.axml_tampered = True
+            log.warning(
+                "Declared filesize ({}) is smaller than total file size ({}). "
+                "Was something appended to the file? Trying to parse it anyways.".format(
+                    self.filesize, self.buff.size()
+                )
+            )
+
+        # Not that severe of an error, we have plenty files where this is not
+        # set correctly
+        if axml_header.type != const.RES_XML_TYPE:
+            self.axml_tampered = True
+            log.warning(
+                "AXML file has an unusual resource type! "
+                "Malware likes to to such stuff to anti androguard! "
+                "But we try to parse it anyways. "
+                "Resource Type: 0x{:04x}".format(axml_header.type)
+            )
+
+        # Now we parse the STRING POOL
+        try:
+            header = ARSCHeader(self.buff)
+        except AssertionError as e:
+            log.error("Error parsing resource header of string pool: %s", e)
+            self._valid = False
+            return
+
+        if header.header_size != 0x1C:
+            log.error(
+                "This does not look like an AXML file. String chunk header "
+                "size does not equal 28! header size = {}".format(
+                    header.header_size
+                )
+            )
+            self._valid = False
+            return
+
+        if header.type != const.RES_STRING_POOL_TYPE:
+            log.error(
+                "Expected String Pool header, got resource type 0x{:04x} "
+                "instead".format(header.type)
+            )
+            self._valid = False
+            return
+
+        self.sb = StringBlock(self.buff, header)
+
+    def is_valid(self):
+        """
+        Report whether the parser is still in a valid state.
+
+        :return: False once an error occurred somewhere while parsing the
+            file, True otherwise
+        """
+        return self._valid
+
+    def _reset(self):
+        """
+        Reset all per-chunk state to its "nothing parsed" defaults.
+
+        `m_comment_index` and `m_attribute_count` are reset as well, so that
+        the `comment` property works before the first XML chunk was read and
+        never reports the comment of a previous chunk.
+        """
+        self.m_event = -1
+        self.m_lineNumber = -1
+        # 0xFFFFFFFF is the value the `comment` property treats as "no comment"
+        self.m_comment_index = 0xFFFFFFFF
+        self.m_name = -1
+        self.m_namespaceUri = -1
+        self.m_attributes = []
+        self.m_attribute_count = 0
+        self.m_idAttribute = -1
+        self.m_classAttribute = -1
+        self.m_styleAttribute = -1
+
+    def __next__(self):
+        """
+        Advance to the next event and return it.
+
+        :return: the value of `m_event` after advancing
+        """
+        self._do_next()
+        event = self.m_event
+        return event
+
+    # Python 2 looks for `next` instead of `__next__`
+    next = __next__
+
+ def _do_next(self):
+ """
+ Advance the state machine to the next chunk that produces an event.
+
+ Does nothing once `m_event` is `END_DOCUMENT`. Otherwise the per-chunk
+ state is reset and chunks are read until one of the following happens:
+
+ * a START_TAG, END_TAG or TEXT event was produced,
+ * the end of the data is reached (`m_event` becomes `END_DOCUMENT`),
+ * a parsing error is logged and the parser is marked as invalid.
+
+ Resource map and namespace chunks are consumed here without producing
+ an event; chunks whose type is outside the XML chunk range are skipped.
+ """
+ if self.m_event == const.END_DOCUMENT:
+ return
+
+ self._reset()
+ while self._valid:
+ # Stop at the declared filesize or at the end of the file
+ if self.buff.end() or self.buff.get_idx() == self.filesize:
+ self.m_event = const.END_DOCUMENT
+ break
+
+ # Again, we read an ARSCHeader
+ try:
+ h = ARSCHeader(self.buff)
+ except AssertionError as e:
+ log.error("Error parsing resource header: %s", e)
+ self._valid = False
+ return
+
+ # Special chunk: Resource Map. This chunk might be contained inside
+ # the file, after the string pool.
+ if h.type == const.RES_XML_RESOURCE_MAP_TYPE:
+ log.debug("AXML contains a RESOURCE MAP")
+ # Check size: < 8 bytes mean that the chunk is not complete
+ # Should be aligned to 4 bytes.
+ if h.size < 8 or (h.size % 4) != 0:
+ log.error("Invalid chunk size in chunk XML_RESOURCE_MAP")
+ self._valid = False
+ return
+
+ # The payload after the header is a list of 4 byte resource IDs
+ for i in range((h.size - h.header_size) // 4):
+ self.m_resourceIDs.append(unpack('<L', self.buff.read(4))[0])
+
+ continue
+
+ # Parse now the XML chunks.
+ # unknown chunk types might cause problems, but we can skip them!
+ if h.type < const.RES_XML_FIRST_CHUNK_TYPE or h.type > const.RES_XML_LAST_CHUNK_TYPE:
+ # h.size is the size of the whole chunk including the header.
+ # NOTE(review): h.end is presumably the offset right behind this
+ # chunk, so jumping there skips it - confirm against ARSCHeader.
+ log.error("Not a XML resource chunk type: 0x{:04x}. Skipping {} bytes".format(h.type, h.size))
+ self.buff.set_idx(h.end)
+ continue
+
+ # Check that we read a correct header
+ if h.header_size != 0x10:
+ log.error(
+ "XML Resource Type Chunk header size does not match 16! "
+ "At chunk type 0x{:04x}, declared header size={}, "
+ "chunk size={}".format(h.type, h.header_size, h.size)
+ )
+ self._valid = False
+ return
+
+ # Line Number of the source file, only used as meta information
+ self.m_lineNumber, = unpack('<L', self.buff.read(4))
+
+ # Comment_Index (usually 0xFFFFFFFF)
+ self.m_comment_index, = unpack('<L', self.buff.read(4))
+
+ # Comments on namespace chunks are only reported, never stored
+ if self.m_comment_index != 0xFFFFFFFF and h.type in [
+ const.RES_XML_START_NAMESPACE_TYPE,
+ const.RES_XML_END_NAMESPACE_TYPE]:
+ log.warning("Unhandled Comment at namespace chunk: '{}'".format(
+ self.sb[self.m_comment_index])
+ )
+
+ if h.type == const.RES_XML_START_NAMESPACE_TYPE:
+ prefix, = unpack('<L', self.buff.read(4))
+ uri, = unpack('<L', self.buff.read(4))
+
+ s_prefix = self.sb[prefix]
+ s_uri = self.sb[uri]
+
+ log.debug(
+ "Start of Namespace mapping: prefix "
+ "{}: '{}' --> uri {}: '{}'".format(
+ prefix, s_prefix, uri, s_uri
+ )
+ )
+
+ if s_uri == '':
+ log.warning("Namespace prefix '{}' resolves to empty URI. "
+ "This might be a packer.".format(s_prefix))
+
+ # Duplicates are only reported; the mapping is stored again anyway
+ if (prefix, uri) in self.namespaces:
+ log.info(
+ "Namespace mapping ({}, {}) already seen! "
+ "This is usually not a problem but could indicate "
+ "packers or broken AXML compilers.".format(prefix, uri))
+ self.namespaces.append((prefix, uri))
+
+ # We can continue with the next chunk, as we store the namespace
+ # mappings for each tag
+ continue
+
+ if h.type == const.RES_XML_END_NAMESPACE_TYPE:
+ # END_PREFIX contains again prefix and uri field
+ prefix, = unpack('<L', self.buff.read(4))
+ uri, = unpack('<L', self.buff.read(4))
+
+ # We remove one stored occurrence of the matching namespace mapping
+ # (list.remove drops the first equal entry)
+ if (prefix, uri) in self.namespaces:
+ self.namespaces.remove((prefix, uri))
+ else:
+ log.warning(
+ "Reached a NAMESPACE_END without having the namespace stored before? "
+ "Prefix ID: {}, URI ID: {}".format(prefix, uri)
+ )
+
+ # We can continue with the next chunk, as we store the namespace
+ # mappings for each tag
+ continue
+
+ # START_TAG is the start of a new tag.
+ if h.type == const.RES_XML_START_ELEMENT_TYPE:
+ # The TAG consists of some fields:
+ # * (chunk_size, line_number, comment_index - we read before)
+ # * namespace_uri
+ # * name
+ # * flags
+ # * attribute_count
+ # * class_attribute
+ # After that, there are two lists of attributes, 20 bytes each
+
+ # Namespace URI (String ID)
+ self.m_namespaceUri, = unpack('<L', self.buff.read(4))
+ # Name of the Tag (String ID)
+ self.m_name, = unpack('<L', self.buff.read(4))
+ # FIXME: Flags
+ _ = self.buff.read(4) # noqa
+ # Attribute Count
+ attributeCount, = unpack('<L', self.buff.read(4))
+ # Class Attribute
+ self.m_classAttribute, = unpack('<L', self.buff.read(4))
+
+ # Both 32 bit words are split into two 16 bit halves; the id, class
+ # and style values are additionally decremented by one.
+ self.m_idAttribute = (attributeCount >> 16) - 1
+ self.m_attribute_count = attributeCount & 0xFFFF
+ self.m_styleAttribute = (self.m_classAttribute >> 16) - 1
+ self.m_classAttribute = (self.m_classAttribute & 0xFFFF) - 1
+
+ # Now, we parse the attributes.
+ # Each attribute has 5 fields of 4 byte
+ for i in range(0, self.m_attribute_count * const.ATTRIBUTE_LENGHT):
+ # Each field is linearly parsed into the array
+ # Each Attribute contains:
+ # * Namespace URI (String ID)
+ # * Name (String ID)
+ # * Value
+ # * Type
+ # * Data
+ self.m_attributes.append(unpack('<L', self.buff.read(4))[0])
+
+ # Reduce the value type field of every attribute to its most
+ # significant byte
+ for i in range(const.ATTRIBUTE_IX_VALUE_TYPE, len(self.m_attributes), const.ATTRIBUTE_LENGHT):
+ self.m_attributes[i] = self.m_attributes[i] >> 24
+
+ self.m_event = const.START_TAG
+ break
+
+ if h.type == const.RES_XML_END_ELEMENT_TYPE:
+ self.m_namespaceUri, = unpack('<L', self.buff.read(4))
+ self.m_name, = unpack('<L', self.buff.read(4))
+
+ self.m_event = const.END_TAG
+ break
+
+ if h.type == const.RES_XML_CDATA_TYPE:
+ # The CDATA field is like an attribute.
+ # It contains an index into the String pool
+ # as well as a typed value.
+ # usually, this typed value is set to UNDEFINED
+
+ # ResStringPool_ref data --> uint32_t index
+ self.m_name, = unpack('<L', self.buff.read(4))
+
+ # Res_value typedData:
+ # uint16_t size
+ # uint8_t res0 -> always zero
+ # uint8_t dataType
+ # uint32_t data
+ # For now, we ignore these values (they are only logged)
+ size, res0, dataType, data = unpack("<HBBL", self.buff.read(8))
+
+ log.debug(
+ "found a CDATA Chunk: "
+ "index={: 6d}, size={: 4d}, res0={: 4d}, "
+ "dataType={: 4d}, data={: 4d}".format(
+ self.m_name, size, res0, dataType, data
+ )
+ )
+
+ self.m_event = const.TEXT
+ break
+
+ # Still here? Looks like we read an unknown XML header, try to skip it...
+ log.warning("Unknown XML Chunk: 0x{:04x}, skipping {} bytes.".format(h.type, h.size))
+ self.buff.set_idx(h.end)
+
+    @property
+    def name(self):
+        """
+        Name of the current tag as a string.
+
+        An empty string is returned when no name is set or when the current
+        event is neither START_TAG nor END_TAG.
+        """
+        if self.m_name == -1:
+            return u''
+        if self.m_event not in (const.START_TAG, const.END_TAG):
+            return u''
+
+        return self.sb[self.m_name]
+
+    @property
+    def comment(self):
+        """
+        Comment attached to the current position, or None if there is none.
+
+        Only comments of tags can be retrieved this way; comments on
+        namespace chunks are dropped while parsing and are not accessible.
+        """
+        index = self.m_comment_index
+        return None if index == 0xFFFFFFFF else self.sb[index]
+
+    @property
+    def namespace(self):
+        """
+        Namespace URI of the current tag as a string.
+
+        An empty string is returned when no name is set, when the current
+        event is neither START_TAG nor END_TAG, or when the tag has no
+        namespace.
+        """
+        if self.m_name == -1:
+            return u''
+        if self.m_event not in (const.START_TAG, const.END_TAG):
+            return u''
+
+        uri = self.m_namespaceUri
+        # 0xFFFFFFFF marks "no namespace"
+        return u'' if uri == 0xFFFFFFFF else self.sb[uri]
+
+    @property
+    def nsmap(self):
+        """
+        Returns the current namespace mapping as a dictionary (prefix -> URI).
+
+        there are several problems with the map and we try to guess a few
+        things here:
+
+        1) a prefix can be bound several times, so we have to decide which
+           binding to take
+        2) a prefix might map to an empty string (some packers)
+        3) uri+prefix mappings might be included several times
+        4) prefix might be empty
+
+        :return: dict mapping prefix strings to URI strings
+        """
+        mapping = {}
+        # Walk the stored mappings in the order they were opened. Assigning
+        # into the dict solves 3) (duplicates simply overwrite themselves)
+        # and 1) (the binding stored last wins). Iterating over a set here
+        # would make the winner depend on arbitrary set ordering.
+        for prefix_id, uri_id in self.namespaces:
+            s_prefix = self.sb[prefix_id]
+            s_uri = self.sb[uri_id]
+            # Solve 2) & 4) by not including
+            if s_uri != "" and s_prefix != "":
+                mapping[s_prefix] = s_uri
+
+        return mapping
+
+    @property
+    def text(self):
+        """
+        String content of the current TEXT event.
+
+        An empty string is returned when no string is set or when the
+        current event is not TEXT.
+        """
+        if self.m_name != -1 and self.m_event == const.TEXT:
+            return self.sb[self.m_name]
+
+        return u''
+
+    def getName(self):
+        """
+        Legacy accessor, kept for backwards compatibility only.
+
+        New code should read the :py:attr:`name` property instead.
+        """
+        tag_name = self.name
+        return tag_name
+
+    def getText(self):
+        """
+        Legacy accessor, kept for backwards compatibility only.
+
+        New code should read the :py:attr:`text` property instead.
+        """
+        content = self.text
+        return content
+
+    def getPrefix(self):
+        """
+        Legacy accessor, kept for backwards compatibility only.
+
+        Despite its name it returns the namespace URI; new code should read
+        the :py:attr:`namespace` property instead.
+        """
+        uri = self.namespace
+        return uri
+
+    def _get_attribute_offset(self, index):
+        """
+        Compute where the attribute `index` starts inside `m_attributes`.
+
+        A warning is logged (but the offset is returned nevertheless) if the
+        current event is not START_TAG or if the offset lies outside of the
+        attribute array.
+
+        :param index: index of the attribute
+        :return: offset of the first field of that attribute
+        """
+        if self.m_event != const.START_TAG:
+            log.warning("Current event is not START_TAG.")
+
+        start = index * const.ATTRIBUTE_LENGHT
+        in_range = start < len(self.m_attributes)
+        if not in_range:
+            log.warning("Invalid attribute index")
+
+        return start
+
+    def getAttributeCount(self):
+        """
+        Number of attributes of the current tag.
+
+        :return: the attribute count, or -1 if the current event is not
+            START_TAG
+        """
+        return self.m_attribute_count if self.m_event == const.START_TAG else -1
+
+    def getAttributeUri(self, index):
+        """
+        Numeric (string pool) ID of the namespace URI of an attribute.
+
+        :param index: index of the attribute
+        """
+        base = self._get_attribute_offset(index)
+        return self.m_attributes[base + const.ATTRIBUTE_IX_NAMESPACE_URI]
+
+    def getAttributeNamespace(self, index):
+        """
+        Namespace URI of an attribute as a string.
+
+        :param index: index of the attribute
+        :return: the URI, or an empty string if the attribute has no namespace
+        """
+        uri_id = self.getAttributeUri(index)
+
+        # 0xFFFFFFFF marks "no namespace"
+        if uri_id != 0xFFFFFFFF:
+            return self.sb[uri_id]
+
+        return u''
+
+    def getAttributeName(self, index):
+        """
+        Returns the String which represents the attribute name
+
+        If the string pool holds no name for the attribute, the name is
+        resolved through the resource map and the table of known system
+        attributes. If the resource map has no entry for it either, a
+        warning is logged and the empty name is returned instead of raising
+        an IndexError.
+
+        :param index: index of the attribute
+        :return: the attribute name
+        """
+        offset = self._get_attribute_offset(index)
+        name = self.m_attributes[offset + const.ATTRIBUTE_IX_NAME]
+
+        res = self.sb[name]
+        if res:
+            return res
+
+        # The result is a (null) string, we need to look it up.
+        # The resource map is optional and may be shorter than the string
+        # pool, so the index has to be checked first.
+        if name >= len(self.m_resourceIDs):
+            log.warning(
+                "Attribute name with string ID {} is empty and has no "
+                "entry in the resource map".format(name)
+            )
+            return res
+
+        attr = self.m_resourceIDs[name]
+        known_attributes = public.SYSTEM_RESOURCES['attributes']['inverse']
+        if attr in known_attributes:
+            return 'android:' + known_attributes[attr]
+
+        # Attach the HEX Number, so for multiple missing attributes we do not run
+        # into problems.
+        return 'android:UNKNOWN_SYSTEM_ATTRIBUTE_{:08x}'.format(attr)
+
+    def getAttributeValueType(self, index):
+        """
+        Type of the value of the attribute at the given index
+
+        :param index: index of the attribute
+        """
+        base = self._get_attribute_offset(index)
+        slot = base + const.ATTRIBUTE_IX_VALUE_TYPE
+        return self.m_attributes[slot]
+
+    def getAttributeValueData(self, index):
+        """
+        Data field of the value of the attribute at the given index
+
+        :param index: index of the attribute
+        """
+        base = self._get_attribute_offset(index)
+        slot = base + const.ATTRIBUTE_IX_VALUE_DATA
+        return self.m_attributes[slot]
+
+    def getAttributeValue(self, index):
+        """
+        Look up the string value of the attribute at the given index.
+
+        Only attributes of type string are resolved here; values of any
+        other type have to be formatted elsewhere.
+        # FIXME should unite those functions
+
+        :param index: index of the attribute
+        :return: the string value, or an empty string if the attribute is
+            not of type string
+        """
+        base = self._get_attribute_offset(index)
+        if self.m_attributes[base + const.ATTRIBUTE_IX_VALUE_TYPE] != const.TYPE_STRING:
+            return u''
+
+        string_id = self.m_attributes[base + const.ATTRIBUTE_IX_VALUE_STRING]
+        return self.sb[string_id]