Handle HTML with embedded tags

When an element, such as <p>, has a name="translatable" attribute, we also want to grab markup tags inside the element and translate them as well. For example, previously this HTML:

    <p name="translatable">An embedded <b>tag</b> in a paragraph</p>

would result in the following string being extracted:

    _("An embedded");

However, we want it to be:

    _("An embedded <b>tag</b> in a paragraph");

This removes the use of BeautifulSoup from the eos-html-extractor script. BeautifulSoup could have done this quite easily, but unfortunately it does not provide any line number information, which we need. Previously, in order to get the line numbers, we also used html.parser from Python's standard library to augment the data we got from BeautifulSoup. However, this issue required html.parser to do all the work that BeautifulSoup did anyway, so there is no reason to use BeautifulSoup anymore.

[endlessm/eos-sdk#3291]
author: Philip Chimento <philip@endlessm.com> 2015-06-18 13:58:53 -0700
committer: Philip Chimento <philip@endlessm.com> 2015-06-18 14:04:37 -0700
commit: ac66ca54c46a8701c2e1c78c31225cf1ce5fe2da (patch)
tree: c021e20d52b677fe49a59e1a42fbd5ab62999c99
parent: a2fe0c90e6cbc4d95f772f945833e5dd4255dca8 (diff)
4 files changed, 50 insertions, 15 deletions
diff --git a/test/tools/test.html b/test/tools/test.html
index 6687013..18c07e3 100644
--- a/test/tools/test.html
+++ b/test/tools/test.html
@@ -20,6 +20,7 @@
                 but that doesn't matter to HTML.
             </p>
             <span name="translatable">String with a "quote"</span>
+            <span name="translatable">String with<br>embedded <b>tags</b></span>
         </section>
     </body>
 </html>
diff --git a/test/tools/testHtmlExtractor.js b/test/tools/testHtmlExtractor.js
index 7d622e1..c1bde4e 100644
--- a/test/tools/testHtmlExtractor.js
+++ b/test/tools/testHtmlExtractor.js
@@ -11,7 +11,9 @@ _("Choose a template");\n\
 #line 21 "test/tools/test.html"\n\
 _("This is a string that is spread over multiple lines, but that doesn\'t matter to HTML.");\n\
 #line 22 "test/tools/test.html"\n\
-_("String with a \\"quote\\"");\n';
+_("String with a \\"quote\\"");\n\
+#line 23 "test/tools/test.html"\n\
+_("String with<br>embedded <b>tags</b>");\n';
 
 describe('eos-html-extractor', function () {
     it('works correctly at a minimum', function () {
diff --git a/test/webhelper/testTranslate2.js b/test/webhelper/testTranslate2.js
index 9cb7042..12a18f6 100644
--- a/test/webhelper/testTranslate2.js
+++ b/test/webhelper/testTranslate2.js
@@ -150,6 +150,11 @@ describe('WebHelper2 translator', function () {
             run_loop('<p name="translatable">String with "quotes"</p>');
             expect(gettext_spy).toHaveBeenCalledWith('String with "quotes"');
         });
+
+        it('handles embedded tags correctly', function () {
+            run_loop('<p name="translatable">Embedded<br><b>tags</b></p>');
+            expect(gettext_spy).toHaveBeenCalledWith('Embedded<br><b>tags</b>');
+        });
     });
 
     describe('used from client-side Javascript', function () {
diff --git a/tools/eos-html-extractor b/tools/eos-html-extractor
index c7f87cb..bf50e98 100755
--- a/tools/eos-html-extractor
+++ b/tools/eos-html-extractor
@@ -7,7 +7,6 @@ import io
 import os.path
 import re
 import sys
-from bs4 import BeautifulSoup
 from html.parser import HTMLParser
 
 ESCAPES = str.maketrans({
@@ -20,14 +19,41 @@ def normalize_string(string):
 
 # Parser that adds line numbers to the HTML strings that need translating
 class TranslatableHTMLParser(HTMLParser):
-    def __init__(self, translatable_strings):
+    def __init__(self):
         super().__init__()
         self.all_translatable_data = []
         self._comments_with_line_numbers = []
-        self._translatable_strings = set(translatable_strings)
+        self._current_translatable_tag_level = None
+        self._opened_tags = []
+        self._tag_level = 0
+        self._text = ''
 
-    def handle_data(self, data):
-        if data not in self._translatable_strings:
+    def handle_starttag(self, tag, attrs):
+        self._tag_level += 1
+        self._opened_tags.append(tag)
+
+        if self._current_translatable_tag_level is not None:
+            self._text += self.get_starttag_text()
+            return
+
+        if ('name', 'translatable') in attrs:
+            self._current_translatable_tag_level = self._tag_level
+
+    def handle_endtag(self, tag):
+        self._tag_level -= 1
+        # In non-X HTML, there can be tags that don't close, e.g. <meta>, <br>
+        while self._opened_tags.pop() != tag:
+            self._tag_level -= 1
+
+        if (self._current_translatable_tag_level is not None and
+            self._current_translatable_tag_level >= self._tag_level + 1):
+            self._current_translatable_tag_level = None
+
+        if self._current_translatable_tag_level is not None:
+            self._text += '</' + tag + '>'
+            return
+
+        if not self._text:
             return
 
         code_line = self.getpos()[0]
@@ -45,7 +71,13 @@ class TranslatableHTMLParser(HTMLParser):
             if comment_line + comment_length == code_line:
                 optional_comment = ' '.join(comment_string.split())
 
-        self.all_translatable_data.append((normalize_string(data), code_line, optional_comment))
+        self.all_translatable_data.append((normalize_string(self._text), code_line, optional_comment))
+        self._text = ''
+
+    def handle_data(self, data):
+        if self._current_translatable_tag_level is None:
+            return
+        self._text += data
 
     def handle_comment(self, comment):
         self._comments_with_line_numbers.append((comment, self.getpos()[0]))
@@ -70,17 +102,12 @@ top_dir = args.top_srcdir
 final_path = os.path.relpath(html_file, top_dir)
 out_file = args.output
 
-# Create the BeautifulSoup HTML-parsing object
 with open(html_file, encoding='utf-8') as f:
     page = f.read()
-soup = BeautifulSoup(page)
-
-# Extract all translatable strings from that HTML
-translatable_divs = soup.find_all(attrs={'name': 'translatable'})
-translatable_strings = map(lambda div: div.contents[0], translatable_divs)
 
-# Find the line numbers for those strings
-parser = TranslatableHTMLParser(translatable_strings)
+# Extract all translatable strings from the HTML and find the line numbers for
+# those strings
+parser = TranslatableHTMLParser()
 parser.feed(page)
 
 # Write out all info about the translatable strings found in this file
author	Philip Chimento <philip@endlessm.com>	2015-06-18 13:58:53 -0700
committer	Philip Chimento <philip@endlessm.com>	2015-06-18 14:04:37 -0700
commit	ac66ca54c46a8701c2e1c78c31225cf1ce5fe2da (patch)
tree	c021e20d52b677fe49a59e1a42fbd5ab62999c99
parent	a2fe0c90e6cbc4d95f772f945833e5dd4255dca8 (diff)