diff options
author | Philip Chimento <philip@endlessm.com> | 2015-06-18 13:58:53 -0700 |
---|---|---|
committer | Philip Chimento <philip@endlessm.com> | 2015-06-18 14:04:37 -0700 |
commit | ac66ca54c46a8701c2e1c78c31225cf1ce5fe2da (patch) | |
tree | c021e20d52b677fe49a59e1a42fbd5ab62999c99 /tools/eos-html-extractor | |
parent | a2fe0c90e6cbc4d95f772f945833e5dd4255dca8 (diff) |
Handle HTML with embedded tags
When an element, such as `<p>`, has a `name="translatable"` attribute, we
also want to grab any markup tags nested inside the element, so that they
are translated along with the surrounding text.
For example, previously this HTML:
`<p name="translatable">An embedded <b>tag</b> in a paragraph</p>`
would result in the following string being extracted:
`_("An embedded");`
However, we want it to be:
`_("An embedded <b>tag</b> in a paragraph");`
This removes the use of BeautifulSoup from the eos-html-extractor script.
BeautifulSoup could have done this quite easily, but unfortunately it
does not provide any line number information, which we need. Previously,
in order to get the line numbers, we also used html.parser from Python's
standard library to augment the data we got from BeautifulSoup. However,
fixing this issue requires html.parser to do all the work that BeautifulSoup
did anyway, so there is no reason to use BeautifulSoup anymore.
[endlessm/eos-sdk#3291]
Diffstat (limited to 'tools/eos-html-extractor')
-rwxr-xr-x | tools/eos-html-extractor | 55 |
1 files changed, 41 insertions, 14 deletions
diff --git a/tools/eos-html-extractor b/tools/eos-html-extractor index c7f87cb..bf50e98 100755 --- a/tools/eos-html-extractor +++ b/tools/eos-html-extractor @@ -7,7 +7,6 @@ import io import os.path import re import sys -from bs4 import BeautifulSoup from html.parser import HTMLParser ESCAPES = str.maketrans({ @@ -20,14 +19,41 @@ def normalize_string(string): # Parser that adds line numbers to the HTML strings that need translating class TranslatableHTMLParser(HTMLParser): - def __init__(self, translatable_strings): + def __init__(self): super().__init__() self.all_translatable_data = [] self._comments_with_line_numbers = [] - self._translatable_strings = set(translatable_strings) + self._current_translatable_tag_level = None + self._opened_tags = [] + self._tag_level = 0 + self._text = '' - def handle_data(self, data): - if data not in self._translatable_strings: + def handle_starttag(self, tag, attrs): + self._tag_level += 1 + self._opened_tags.append(tag) + + if self._current_translatable_tag_level is not None: + self._text += self.get_starttag_text() + return + + if ('name', 'translatable') in attrs: + self._current_translatable_tag_level = self._tag_level + + def handle_endtag(self, tag): + self._tag_level -= 1 + # In non-X HTML, there can be tags that don't close, e.g. 
<meta>, <br> + while self._opened_tags.pop() != tag: + self._tag_level -= 1 + + if (self._current_translatable_tag_level is not None and + self._current_translatable_tag_level >= self._tag_level + 1): + self._current_translatable_tag_level = None + + if self._current_translatable_tag_level is not None: + self._text += '</' + tag + '>' + return + + if not self._text: return code_line = self.getpos()[0] @@ -45,7 +71,13 @@ class TranslatableHTMLParser(HTMLParser): if comment_line + comment_length == code_line: optional_comment = ' '.join(comment_string.split()) - self.all_translatable_data.append((normalize_string(data), code_line, optional_comment)) + self.all_translatable_data.append((normalize_string(self._text), code_line, optional_comment)) + self._text = '' + + def handle_data(self, data): + if self._current_translatable_tag_level is None: + return + self._text += data def handle_comment(self, comment): self._comments_with_line_numbers.append((comment, self.getpos()[0])) @@ -70,17 +102,12 @@ top_dir = args.top_srcdir final_path = os.path.relpath(html_file, top_dir) out_file = args.output -# Create the BeautifulSoup HTML-parsing object with open(html_file, encoding='utf-8') as f: page = f.read() -soup = BeautifulSoup(page) - -# Extract all translatable strings from that HTML -translatable_divs = soup.find_all(attrs={'name': 'translatable'}) -translatable_strings = map(lambda div: div.contents[0], translatable_divs) -# Find the line numbers for those strings -parser = TranslatableHTMLParser(translatable_strings) +# Extract all translatable strings from the HTML and find the line numbers for +# those strings +parser = TranslatableHTMLParser() parser.feed(page) # Write out all info about the translatable strings found in this file |