4 files changed, 50 insertions, 15 deletions
diff --git a/test/tools/test.html b/test/tools/test.html
index 6687013..18c07e3 100644
--- a/test/tools/test.html
+++ b/test/tools/test.html
@@ -20,6 +20,7 @@
                 but that doesn't matter to HTML.
             </p>
             <span name="translatable">String with a "quote"</span>
+            <span name="translatable">String with<br>embedded <b>tags</b></span>
         </section>
     </body>
 </html>
diff --git a/test/tools/testHtmlExtractor.js b/test/tools/testHtmlExtractor.js
index 7d622e1..c1bde4e 100644
--- a/test/tools/testHtmlExtractor.js
+++ b/test/tools/testHtmlExtractor.js
@@ -11,7 +11,9 @@ _("Choose a template");\n\
 #line 21 "test/tools/test.html"\n\
 _("This is a string that is spread over multiple lines, but that doesn\'t matter to HTML.");\n\
 #line 22 "test/tools/test.html"\n\
-_("String with a \\"quote\\"");\n';
+_("String with a \\"quote\\"");\n\
+#line 23 "test/tools/test.html"\n\
+_("String with<br>embedded <b>tags</b>");\n';
 
 describe('eos-html-extractor', function () {
     it('works correctly at a minimum', function () {
diff --git a/test/webhelper/testTranslate2.js b/test/webhelper/testTranslate2.js
index 9cb7042..12a18f6 100644
--- a/test/webhelper/testTranslate2.js
+++ b/test/webhelper/testTranslate2.js
@@ -150,6 +150,11 @@ describe('WebHelper2 translator', function () {
             run_loop('<p name="translatable">String with "quotes"</p>');
             expect(gettext_spy).toHaveBeenCalledWith('String with "quotes"');
         });
+
+        it('handles embedded tags correctly', function () {
+            run_loop('<p name="translatable">Embedded<br><b>tags</b></p>');
+            expect(gettext_spy).toHaveBeenCalledWith('Embedded<br><b>tags</b>');
+        });
     });
 
     describe('used from client-side Javascript', function () {
diff --git a/tools/eos-html-extractor b/tools/eos-html-extractor
index c7f87cb..bf50e98 100755
--- a/tools/eos-html-extractor
+++ b/tools/eos-html-extractor
@@ -7,7 +7,6 @@ import io
 import os.path
 import re
 import sys
-from bs4 import BeautifulSoup
 from html.parser import HTMLParser
 
 ESCAPES = str.maketrans({
@@ -20,14 +19,41 @@ def normalize_string(string):
 
 # Parser that adds line numbers to the HTML strings that need translating
 class TranslatableHTMLParser(HTMLParser):
-    def __init__(self, translatable_strings):
+    def __init__(self):
         super().__init__()
         self.all_translatable_data = []
         self._comments_with_line_numbers = []
-        self._translatable_strings = set(translatable_strings)
+        self._current_translatable_tag_level = None
+        self._opened_tags = []
+        self._tag_level = 0
+        self._text = ''
 
-    def handle_data(self, data):
-        if data not in self._translatable_strings:
+    def handle_starttag(self, tag, attrs):
+        self._tag_level += 1
+        self._opened_tags.append(tag)
+
+        if self._current_translatable_tag_level is not None:
+            self._text += self.get_starttag_text()
+            return
+
+        if ('name', 'translatable') in attrs:
+            self._current_translatable_tag_level = self._tag_level
+
+    def handle_endtag(self, tag):
+        self._tag_level -= 1
+        # Plain (non-XHTML) HTML has tags that are never closed, e.g. <meta>, <br>
+        while self._opened_tags.pop() != tag:
+            self._tag_level -= 1
+
+        if (self._current_translatable_tag_level is not None and
+            self._current_translatable_tag_level >= self._tag_level + 1):
+            self._current_translatable_tag_level = None
+
+        if self._current_translatable_tag_level is not None:
+            self._text += '</' + tag + '>'
+            return
+
+        if not self._text:
             return
 
         code_line = self.getpos()[0]
@@ -45,7 +71,13 @@ class TranslatableHTMLParser(HTMLParser):
             if comment_line + comment_length == code_line:
                 optional_comment = ' '.join(comment_string.split())
 
-        self.all_translatable_data.append((normalize_string(data), code_line, optional_comment))
+        self.all_translatable_data.append((normalize_string(self._text), code_line, optional_comment))
+        self._text = ''
+
+    def handle_data(self, data):
+        if self._current_translatable_tag_level is None:
+            return
+        self._text += data
 
     def handle_comment(self, comment):
         self._comments_with_line_numbers.append((comment, self.getpos()[0]))
@@ -70,17 +102,12 @@ top_dir = args.top_srcdir
 final_path = os.path.relpath(html_file, top_dir)
 out_file = args.output
 
-# Create the BeautifulSoup HTML-parsing object
 with open(html_file, encoding='utf-8') as f:
     page = f.read()
-soup = BeautifulSoup(page)
-
-# Extract all translatable strings from that HTML
-translatable_divs = soup.find_all(attrs={'name': 'translatable'})
-translatable_strings = map(lambda div: div.contents[0], translatable_divs)
 
-# Find the line numbers for those strings
-parser = TranslatableHTMLParser(translatable_strings)
+# Extract all translatable strings from the HTML and find the line numbers for
+# those strings
+parser = TranslatableHTMLParser()
 parser.feed(page)
 
 # Write out all info about the translatable strings found in this file