diff options
Diffstat (limited to 'pyaxmlparser/stringblock.py')
-rw-r--r-- | pyaxmlparser/stringblock.py | 272 |
1 files changed, 272 insertions, 0 deletions
# This file is part of Androguard.
#
# Copyright (C) 2012, Anthony Desnos <desnos at t0t0.fr>
# All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS-IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from struct import unpack

import pyaxmlparser.constants as const


log = logging.getLogger("pyaxmlparser.stringblock")


class StringBlock(object):
    """
    StringBlock is a CHUNK inside an AXML File
    It contains all strings, which are used by referencing to ID's

    See http://androidxref.com/9.0.0_r3/xref/frameworks/base/libs/androidfw/include/androidfw/ResourceTypes.h#436
    """
    def __init__(self, buff, header):
        """
        Parse the string pool from the buffer.

        The five 32-bit header fields, the string/style offset tables, the
        raw string data and (if present) the style data are read
        sequentially from `buff`.

        :param buff: buffer which holds the string block; only its
                     `read(size)` method is used
        :param header: an instance of :class:`~ARSCHeader`; only its `size`
                       attribute is used here
        """
        # Decoded strings, keyed by index, filled lazily by getString()
        self._cache = {}
        self.header = header

        # We already read the header (which was chunk_type and chunk_size)
        # Now, we read the string_count:
        self.stringCount = unpack('<I', buff.read(4))[0]
        # style_count
        self.styleCount = unpack('<I', buff.read(4))[0]

        # flags
        self.flags = unpack('<I', buff.read(4))[0]
        self.m_isUTF8 = ((self.flags & const.UTF8_FLAG) != 0)

        # string_pool_offset
        # Used below, together with header.size, to compute how many bytes
        # of string data have to be read.
        self.stringsOffset = unpack('<I', buff.read(4))[0]
        # style_pool_offset
        # Used below in the same way for the style data.
        self.stylesOffset = unpack('<I', buff.read(4))[0]

        # Check if they supplied a stylesOffset even if the count is 0:
        if self.styleCount == 0 and self.stylesOffset > 0:
            log.info("Styles Offset given, but styleCount is zero. "
                     "This is not a problem but could indicate packers.")

        # Raw (still encoded) string data; this is bytes, not text
        self.m_charbuff = b""
        self.m_styles = []

        # Next, there is a list of string following.
        # This is only a list of offsets (4 byte each)
        self.m_stringOffsets = [
            unpack('<I', buff.read(4))[0] for _ in range(self.stringCount)
        ]

        # And a list of styles
        # again, a list of offsets
        self.m_styleOffsets = [
            unpack('<I', buff.read(4))[0] for _ in range(self.styleCount)
        ]

        has_styles = self.stylesOffset != 0 and self.styleCount != 0

        # FIXME it is probably better to parse n strings and not calculate the size
        if has_styles:
            # The string data ends where the style data begins, so do not
            # read the styles as part of the strings.
            size = self.stylesOffset - self.stringsOffset
        else:
            # No styles: the string data runs to the end of the chunk.
            size = self.header.size - self.stringsOffset

        if (size % 4) != 0:
            log.warning("Size of strings is not aligned by four bytes.")

        self.m_charbuff = buff.read(size)

        if has_styles:
            size = self.header.size - self.stylesOffset

            if (size % 4) != 0:
                log.warning("Size of styles is not aligned by four bytes.")

            # The style data is kept as a flat list of 32-bit words
            for _ in range(0, size // 4):
                self.m_styles.append(unpack('<I', buff.read(4))[0])

    def __getitem__(self, idx):
        """
        Returns the string at the index in the string table
        """
        return self.getString(idx)

    def __len__(self):
        """
        Get the number of strings stored in this table
        """
        return self.stringCount

    def __iter__(self):
        """
        Iterable over all strings
        """
        for i in range(self.stringCount):
            yield self.getString(i)

    def getString(self, idx):
        """
        Return the string at the index in the string table

        Decoded strings are cached, so every index is decoded only once.

        :param idx: index in the string table
        :return: the decoded string, or an empty string if the index is
                 outside of the table
        """
        if idx in self._cache:
            return self._cache[idx]

        # Valid indices are 0 .. stringCount - 1; anything else must not
        # reach the offset table lookup below.
        if idx < 0 or not self.m_stringOffsets or idx >= self.stringCount:
            return ""

        offset = self.m_stringOffsets[idx]

        if self.m_isUTF8:
            self._cache[idx] = self._decode8(offset)
        else:
            self._cache[idx] = self._decode16(offset)

        return self._cache[idx]

    def getStyle(self, idx):
        """
        Return the style associated with the index

        :param idx: index into the list of 32-bit words read from the style
                    data
        :return: the value stored at that index
        """
        return self.m_styles[idx]

    def _decode8(self, offset):
        """
        Decode an UTF-8 String at the given offset

        :param offset: offset of the string inside the data
        :raises AssertionError: if the string is not null terminated
        :return: str
        """
        # UTF-8 Strings contain two lengths, as they might differ:
        # 1) the UTF-16 length
        str_len, skip = self._decode_length(offset, 1)
        offset += skip

        # 2) the utf-8 string length
        encoded_bytes, skip = self._decode_length(offset, 1)
        offset += skip

        end = offset + encoded_bytes
        data = self.m_charbuff[offset:end]

        # Compare a one byte slice instead of indexing: a slice never raises
        # on truncated data and always compares as bytes, so a missing
        # terminator is reported the same way as in _decode16.
        assert self.m_charbuff[end:end + 1] == b"\x00", \
            "UTF-8 String is not null terminated! At offset={}".format(offset)

        return self._decode_bytes(data, 'utf-8', str_len)

    def _decode16(self, offset):
        """
        Decode an UTF-16 String at the given offset

        :param offset: offset of the string inside the data
        :raises AssertionError: if the string is not null terminated
        :return: str
        """
        str_len, skip = self._decode_length(offset, 2)
        offset += skip

        # The len is the string len in utf-16 units
        encoded_bytes = str_len * 2
        end = offset + encoded_bytes

        data = self.m_charbuff[offset:end]

        assert self.m_charbuff[end:end + 2] == b"\x00\x00", \
            "UTF-16 String is not null terminated! At offset={}".format(offset)

        # The length prefix is read as little endian (see _decode_length),
        # so decode the data with an explicit byte order too. Plain 'utf-16'
        # would follow a leading BOM or fall back to the host byte order.
        return self._decode_bytes(data, 'utf-16-le', str_len)

    @staticmethod
    def _decode_bytes(data, encoding, str_len):
        """
        Generic decoding with length check.
        The string is decoded from bytes with the given encoding, then the length
        of the string is checked. A mismatch is only logged as a warning.
        The string is decoded using the "replace" method.

        :param data: bytes
        :param encoding: name of the codec used to decode `data`
        :param str_len: expected length of the decoded string
        :return: str
        """
        string = data.decode(encoding, 'replace')
        if len(string) != str_len:
            log.warning("invalid decoded string length")
        return string

    def _decode_length(self, offset, sizeof_char):
        """
        Generic Length Decoding at offset of string

        The method works for both 8 and 16 bit Strings.
        The length is stored in one unit (one byte or one 16 bit word). If
        the high bit of that unit is set, the remaining bits are the upper
        part of the length and the following unit holds the lower part.

        Length checks are enforced:
        * 8 bit strings: maximum of 0x7FFF bytes (See
        http://androidxref.com/9.0.0_r3/xref/frameworks/base/libs/androidfw/ResourceTypes.cpp#692)
        * 16 bit strings: maximum of 0x7FFFFFFF units (See
        http://androidxref.com/9.0.0_r3/xref/frameworks/base/libs/androidfw/ResourceTypes.cpp#670)

        :param offset: offset into the string data section of the beginning of
                       the string
        :param sizeof_char: number of bytes per char (1 = 8bit, 2 = 16bit)
        :returns: tuple of (length, read bytes)
        """
        sizeof_2chars = sizeof_char << 1
        fmt = "<2{}".format('B' if sizeof_char == 1 else 'H')
        # 0x80 for 8 bit units, 0x8000 for 16 bit units
        highbit = 0x80 << (8 * (sizeof_char - 1))

        # Always unpack two units; the second one is only used if the high
        # bit of the first one is set.
        length1, length2 = unpack(fmt, self.m_charbuff[offset:(offset + sizeof_2chars)])

        if (length1 & highbit) != 0:
            length = ((length1 & ~highbit) << (8 * sizeof_char)) | length2
            size = sizeof_2chars
        else:
            length = length1
            size = sizeof_char

        if sizeof_char == 1:
            assert length <= 0x7FFF, "length of UTF-8 string is too large! At offset={}".format(offset)
        else:
            assert length <= 0x7FFFFFFF, "length of UTF-16 string is too large! At offset={}".format(offset)

        return length, size

    def show(self):
        """
        Print some information on stdout about the string table
        """
        print("StringBlock(stringsCount=0x%x, "
              "stringsOffset=0x%x, "
              "stylesCount=0x%x, "
              "stylesOffset=0x%x, "
              "flags=0x%x"
              ")" % (self.stringCount,
                     self.stringsOffset,
                     self.styleCount,
                     self.stylesOffset,
                     self.flags))

        if self.stringCount > 0:
            print()
            print("String Table: ")
            for i, s in enumerate(self):
                print("{:08d} {}".format(i, repr(s)))

        if self.styleCount > 0:
            print()
            print("Styles Table: ")
            for i in range(self.styleCount):
                print("{:08d} {}".format(i, repr(self.getStyle(i))))