summaryrefslogtreecommitdiff
path: root/pyaxmlparser/stringblock.py
diff options
context:
space:
mode:
Diffstat (limited to 'pyaxmlparser/stringblock.py')
-rw-r--r--pyaxmlparser/stringblock.py272
1 files changed, 272 insertions, 0 deletions
diff --git a/pyaxmlparser/stringblock.py b/pyaxmlparser/stringblock.py
new file mode 100644
index 0000000..3a008f4
--- /dev/null
+++ b/pyaxmlparser/stringblock.py
@@ -0,0 +1,272 @@
+# This file is part of Androguard.
+#
+# Copyright (C) 2012, Anthony Desnos <desnos at t0t0.fr>
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS-IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from struct import unpack
+
+import pyaxmlparser.constants as const
+
+
+log = logging.getLogger("pyaxmlparser.stringblock")
+
+
class StringBlock(object):
    """
    StringBlock is a CHUNK inside an AXML file.
    It contains all strings, which are used by referencing their IDs.

    See http://androidxref.com/9.0.0_r3/xref/frameworks/base/libs/androidfw/include/androidfw/ResourceTypes.h#436
    """

    def __init__(self, buff, header):
        """
        Parse the string pool that follows an already consumed chunk header.

        :param buff: buffer which holds the string block; it must provide a
                     ``read(n)`` method returning bytes
        :param header: an instance of :class:`~ARSCHeader`; only its ``size``
                       attribute is read here
        """
        self._cache = {}
        self.header = header

        # The chunk header (chunk_type and chunk_size) was already read by
        # the caller. The fields below follow it in this exact order.
        self.stringCount = self._read_uint32(buff)
        self.styleCount = self._read_uint32(buff)

        self.flags = self._read_uint32(buff)
        self.m_isUTF8 = ((self.flags & const.UTF8_FLAG) != 0)

        # Both offsets are used below relative to the same origin as
        # header.size.
        self.stringsOffset = self._read_uint32(buff)
        self.stylesOffset = self._read_uint32(buff)

        # Check if they supplied a stylesOffset even if the count is 0:
        if self.styleCount == 0 and self.stylesOffset > 0:
            log.info("Styles Offset given, but styleCount is zero. "
                     "This is not a problem but could indicate packers.")

        # Two offset tables follow (4 bytes per entry): strings, then styles.
        self.m_stringOffsets = [self._read_uint32(buff)
                                for _ in range(self.stringCount)]
        self.m_styleOffsets = [self._read_uint32(buff)
                               for _ in range(self.styleCount)]
        self.m_styles = []

        # Styles are only parsed when both an offset and a count are given.
        has_styles = self.stylesOffset != 0 and self.styleCount != 0

        # FIXME it is probably better to parse n strings and not calculate the size
        if has_styles:
            # The string data ends where the style data begins.
            size = self.stylesOffset - self.stringsOffset
        else:
            size = self.header.size - self.stringsOffset

        if (size % 4) != 0:
            log.warning("Size of strings is not aligned by four bytes.")

        # Raw, still encoded string data; decoded lazily by getString().
        self.m_charbuff = buff.read(size)

        if has_styles:
            size = self.header.size - self.stylesOffset

            if (size % 4) != 0:
                log.warning("Size of styles is not aligned by four bytes.")

            self.m_styles = [self._read_uint32(buff)
                             for _ in range(size // 4)]

    @staticmethod
    def _read_uint32(buff):
        """
        Read one little-endian unsigned 32 bit integer from the buffer

        :param buff: object with a ``read(n)`` method returning bytes
        :return: int
        """
        return unpack('<I', buff.read(4))[0]

    def __getitem__(self, idx):
        """
        Returns the string at the index in the string table
        """
        return self.getString(idx)

    def __len__(self):
        """
        Get the number of strings stored in this table
        """
        return self.stringCount

    def __iter__(self):
        """
        Iterable over all strings
        """
        for i in range(self.stringCount):
            yield self.getString(i)

    def getString(self, idx):
        """
        Return the string at the index in the string table.

        Decoded strings are cached, so every index is decoded at most once.
        An index outside of ``0 <= idx < stringCount`` yields an empty string.

        :param idx: index in the string table
        :return: str
        """
        if idx in self._cache:
            return self._cache[idx]

        # Valid indices are 0 .. stringCount - 1, hence ">=" for the upper bound.
        if idx < 0 or not self.m_stringOffsets or idx >= self.stringCount:
            return ""

        offset = self.m_stringOffsets[idx]
        decode = self._decode8 if self.m_isUTF8 else self._decode16
        self._cache[idx] = decode(offset)

        return self._cache[idx]

    def getStyle(self, idx):
        """
        Return the style associated with the index

        :param idx: index into the list of parsed style words
        :return: the 32 bit value stored at that position
        """
        return self.m_styles[idx]

    def _decode8(self, offset):
        """
        Decode an UTF-8 String at the given offset

        :param offset: offset of the string inside the data
        :raises AssertionError: if the string is not followed by a null byte
        :return: str
        """
        # UTF-8 Strings contain two lengths, as they might differ:
        # 1) the UTF-16 length
        str_len, skip = self._decode_length(offset, 1)
        offset += skip

        # 2) the utf-8 string length
        encoded_bytes, skip = self._decode_length(offset, 1)
        offset += skip

        end = offset + encoded_bytes
        data = self.m_charbuff[offset:end]

        # Slice instead of index: a truncated buffer then fails this check
        # rather than raising IndexError, the same way _decode16 behaves.
        assert self.m_charbuff[end:end + 1] == b"\x00", \
            "UTF-8 String is not null terminated! At offset={}".format(offset)

        return self._decode_bytes(data, 'utf-8', str_len)

    def _decode16(self, offset):
        """
        Decode an UTF-16 String at the given offset

        :param offset: offset of the string inside the data
        :raises AssertionError: if the string is not followed by two null bytes
        :return: str
        """
        str_len, skip = self._decode_length(offset, 2)
        offset += skip

        # The len is the string len in utf-16 units
        encoded_bytes = str_len * 2
        end = offset + encoded_bytes

        data = self.m_charbuff[offset:end]

        assert self.m_charbuff[end:end + 2] == b"\x00\x00", \
            "UTF-16 String is not null terminated! At offset={}".format(offset)

        # The length prefix is read as little endian ('<2H'), so the payload
        # is decoded as little endian too. The plain 'utf-16' codec would
        # treat a leading U+FEFF / U+FFFE as a byte order mark and otherwise
        # fall back to the native byte order of the machine.
        return self._decode_bytes(data, 'utf-16-le', str_len)

    @staticmethod
    def _decode_bytes(data, encoding, str_len):
        """
        Generic decoding with length check.
        The string is decoded from bytes with the given encoding, then the length
        of the string is checked; a mismatch is only logged as a warning.
        The string is decoded using the "replace" method.

        :param data: bytes
        :param encoding: name of the codec used to decode ``data``
        :param str_len: expected length of the decoded string
        :return: str
        """
        string = data.decode(encoding, 'replace')
        if len(string) != str_len:
            log.warning("invalid decoded string length")
        return string

    def _decode_length(self, offset, sizeof_char):
        """
        Generic Length Decoding at offset of string

        The method works for both 8 and 16 bit Strings.
        A length takes one unit (a byte or a 16 bit word). If the high bit of
        the first unit is set, a second unit follows and both are combined.
        Length checks are enforced:
        * 8 bit strings: maximum of 0x7FFF bytes (See
        http://androidxref.com/9.0.0_r3/xref/frameworks/base/libs/androidfw/ResourceTypes.cpp#692)
        * 16 bit strings: maximum of 0x7FFFFFFF bytes (See
        http://androidxref.com/9.0.0_r3/xref/frameworks/base/libs/androidfw/ResourceTypes.cpp#670)

        :param offset: offset into the string data section of the beginning of
                       the string
        :param sizeof_char: number of bytes per char (1 = 8bit, 2 = 16bit)
        :raises AssertionError: if the decoded length exceeds the maximum
        :returns: tuple of (length, read bytes)
        """
        sizeof_2chars = sizeof_char << 1
        fmt = "<2{}".format('B' if sizeof_char == 1 else 'H')
        highbit = 0x80 << (8 * (sizeof_char - 1))

        # Always unpack two units; the second one is only used if the high
        # bit of the first one is set.
        length1, length2 = unpack(fmt, self.m_charbuff[offset:(offset + sizeof_2chars)])

        if (length1 & highbit) != 0:
            length = ((length1 & ~highbit) << (8 * sizeof_char)) | length2
            size = sizeof_2chars
        else:
            length = length1
            size = sizeof_char

        if sizeof_char == 1:
            assert length <= 0x7FFF, "length of UTF-8 string is too large! At offset={}".format(offset)
        else:
            assert length <= 0x7FFFFFFF, "length of UTF-16 string is too large! At offset={}".format(offset)

        return length, size

    def show(self):
        """
        Print some information on stdout about the string table
        """
        print("StringBlock(stringsCount=0x%x, "
              "stringsOffset=0x%x, "
              "stylesCount=0x%x, "
              "stylesOffset=0x%x, "
              "flags=0x%x"
              ")" % (self.stringCount,
                     self.stringsOffset,
                     self.styleCount,
                     self.stylesOffset,
                     self.flags))

        if self.stringCount > 0:
            print()
            print("String Table: ")
            for i, s in enumerate(self):
                print("{:08d} {}".format(i, repr(s)))

        if self.styleCount > 0:
            print()
            print("Styles Table: ")
            for i in range(self.styleCount):
                print("{:08d} {}".format(i, repr(self.getStyle(i))))