diff options
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/eos-html-extractor | 55 |
1 files changed, 41 insertions, 14 deletions
diff --git a/tools/eos-html-extractor b/tools/eos-html-extractor index c7f87cb..bf50e98 100755 --- a/tools/eos-html-extractor +++ b/tools/eos-html-extractor @@ -7,7 +7,6 @@ import io import os.path import re import sys -from bs4 import BeautifulSoup from html.parser import HTMLParser ESCAPES = str.maketrans({ @@ -20,14 +19,41 @@ def normalize_string(string): # Parser that adds line numbers to the HTML strings that need translating class TranslatableHTMLParser(HTMLParser): - def __init__(self, translatable_strings): + def __init__(self): super().__init__() self.all_translatable_data = [] self._comments_with_line_numbers = [] - self._translatable_strings = set(translatable_strings) + self._current_translatable_tag_level = None + self._opened_tags = [] + self._tag_level = 0 + self._text = '' - def handle_data(self, data): - if data not in self._translatable_strings: + def handle_starttag(self, tag, attrs): + self._tag_level += 1 + self._opened_tags.append(tag) + + if self._current_translatable_tag_level is not None: + self._text += self.get_starttag_text() + return + + if ('name', 'translatable') in attrs: + self._current_translatable_tag_level = self._tag_level + + def handle_endtag(self, tag): + self._tag_level -= 1 + # In non-X HTML, there can be tags that don't close, e.g. <meta>, <br> + while self._opened_tags.pop() != tag: + self._tag_level -= 1 + + if (self._current_translatable_tag_level is not None and + self._current_translatable_tag_level >= self._tag_level + 1): + self._current_translatable_tag_level = None + + if self._current_translatable_tag_level is not None: + self._text += '</' + tag + '>' + return + + if not self._text: return code_line = self.getpos()[0] @@ -45,7 +71,13 @@ class TranslatableHTMLParser(HTMLParser): if comment_line + comment_length == code_line: optional_comment = ' '.join(comment_string.split()) - self.all_translatable_data.append((normalize_string(data), code_line, optional_comment)) + self.all_translatable_data.append((normalize_string(self._text), code_line, optional_comment)) + self._text = '' + + def handle_data(self, data): + if self._current_translatable_tag_level is None: + return + self._text += data def handle_comment(self, comment): self._comments_with_line_numbers.append((comment, self.getpos()[0])) @@ -70,17 +102,12 @@ top_dir = args.top_srcdir final_path = os.path.relpath(html_file, top_dir) out_file = args.output -# Create the BeautifulSoup HTML-parsing object with open(html_file, encoding='utf-8') as f: page = f.read() -soup = BeautifulSoup(page) - -# Extract all translatable strings from that HTML -translatable_divs = soup.find_all(attrs={'name': 'translatable'}) -translatable_strings = map(lambda div: div.contents[0], translatable_divs) -# Find the line numbers for those strings -parser = TranslatableHTMLParser(translatable_strings) +# Extract all translatable strings from the HTML and find the line numbers for +# those strings +parser = TranslatableHTMLParser() parser.feed(page) # Write out all info about the translatable strings found in this file |