Add eos-html-extractor and m4 file

This is taken almost directly from the existing version in eos-english. Cleanups to follow in subsequent commits. Previously the m4 code was in two separate macros, but since they were much the same, I combined them into one macro. This also adds a very minimal test for eos-html-extractor, which serves as a quick regression test for the cleanups to follow. [endlessm/eos-sdk#3245]
author: Philip Chimento <philip@endlessm.com> 2015-06-04 09:20:31 -0700
committer: Philip Chimento <philip@endlessm.com> 2015-06-04 14:27:31 -0700
commit: f17a6ff5c41215701b822ccc4d46e89832fbd033 (patch)
tree: 32e80569771061e8948903002c694dafc3514ea1 /tools/eos-html-extractor
parent: fbc49cb284067838416c6022d5c7dcb64899e030 (diff)
1 files changed, 64 insertions, 0 deletions
diff --git a/tools/eos-html-extractor b/tools/eos-html-extractor
new file mode 100755
index 0000000..c17d131
--- /dev/null
+++ b/tools/eos-html-extractor
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+'''
+Created on July 19, 2013
+
+@author: Sebastian
+'''
+# This scraper depends on the BeautifulSoup4 module, make sure
+# it's installed by running the following:
+# apt-get install python-bs4
+import os, re, sys, urllib
+from bs4 import BeautifulSoup
+from HTMLParser import HTMLParser
+
+# Parser that records each translatable string with its line number and any comment directly preceding it
+class TranslatableHTMLParser(HTMLParser):
+    # Reads module-level translatable_strings; appends (text, line, comment) tuples to all_translatable_data
+    def handle_data(self, data):
+        if data in translatable_strings:
+            # Use the most recent comment, or a sentinel that can never be adjacent if no comment was seen yet
+            comment_string, comment_line = comments_with_line_numbers[-1] if comments_with_line_numbers else ("", -1)
+            code_line = self.getpos()[0]
+            # Comment takes up at least one line by default (hence the +1)
+            comment_length = comment_string.count("\n") + 1
+            optional_comment = ""
+            # If the comment immediately preceded this string, include it (whitespace collapsed to one line)
+            if comment_line + comment_length == code_line:
+                optional_comment = " ".join(comment_string.split())
+            all_translatable_data.append((data.strip(), code_line, optional_comment))
+
+    # Remember every comment together with the line number getpos() reports for it
+    def handle_comment(self, comment):
+        comments_with_line_numbers.append((comment, self.getpos()[0]))
+
+# Ensure proper usage: exactly two arguments, the HTML file and the top-level source directory
+if len(sys.argv) != 3:
+    print("Usage:")
+    print("  eos-html-extractor <input-file> <top-srcdir>")
+    sys.exit(1)
+
+# Path of the input file relative to the top-level directory; this is what the #line directives report
+html_file = sys.argv[1]
+top_dir = sys.argv[2]
+final_path = os.path.relpath(html_file, top_dir)
+
+# Read the whole input file, then create the BeautifulSoup HTML-parsing object from it
+page = urllib.urlopen(urllib.pathname2url(html_file)).read()
+soup = BeautifulSoup(page)
+
+# Collect the first child of every element marked name="translatable", encoded as UTF-8
+translatable_divs = soup.find_all(attrs={"name" : "translatable"})
+translatable_strings = [div.contents[0].encode('utf-8') for div in translatable_divs]
+
+# Find the line numbers for those strings; the parser fills these two module-level lists as it runs
+all_translatable_data = []
+comments_with_line_numbers = []
+parser = TranslatableHTMLParser()
+parser.feed(page)
+
+# Write each string as a C-style _("...") call after a #line directive; escape \ and " so the literal stays valid
+for string, line_num, optional_comment in all_translatable_data:
+    print ("#line " + str(line_num) + " \"" + final_path + "\"")
+    if optional_comment != "":
+        print ("// " + optional_comment)
+    print ("_(\"" + string.replace("\\", "\\\\").replace("\"", "\\\"") + "\");")
author	Philip Chimento <philip@endlessm.com>	2015-06-04 09:20:31 -0700
committer	Philip Chimento <philip@endlessm.com>	2015-06-04 14:27:31 -0700
commit	f17a6ff5c41215701b822ccc4d46e89832fbd033 (patch)
tree	32e80569771061e8948903002c694dafc3514ea1 /tools/eos-html-extractor
parent	fbc49cb284067838416c6022d5c7dcb64899e030 (diff)