summaryrefslogtreecommitdiff
path: root/tools/eos-html-extractor
blob: c17d1311f4f3baa4ddcd850fb069130681325311 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python
'''
Created on July 19, 2013

@author: Sebastian
'''
# This scraper depends on the BeautifulSoup4 module, make sure
# it's installed by running the following:
# apt-get install python-bs4
import os, re, sys, urllib
from bs4 import BeautifulSoup
from HTMLParser import HTMLParser

# Parser that adds line numbers to the HTML strings that need translating
class TranslatableHTMLParser(HTMLParser):
    # Relies on module-level state defined later in this script:
    #   translatable_strings       - strings to match against character data
    #   comments_with_line_numbers - filled by handle_comment
    #   all_translatable_data      - output: (string, line, comment) tuples
    def handle_data(self, data):
        """Record the line number (and any immediately-preceding HTML
        comment) for each piece of character data that is translatable."""
        if data not in translatable_strings:
            return
        code_line = self.getpos()[0]
        optional_comment = ""
        # Guard: a translatable string may occur before any comment has
        # been seen, in which case there is no comment to consider.
        # (The original indexed the last element unconditionally and
        # raised IndexError on an empty list.)
        if comments_with_line_numbers:
            comment_string, comment_line = comments_with_line_numbers[-1]
            # A comment occupies at least one line by default (hence the +1)
            comment_length = comment_string.count("\n") + 1
            # If the comment ended on the line directly above this string,
            # include it (with internal whitespace collapsed to spaces).
            if comment_line + comment_length == code_line:
                optional_comment = " ".join(comment_string.split())
        all_translatable_data.append((data.strip(), code_line, optional_comment))

    def handle_comment(self, comment):
        """Remember every comment together with its starting line number."""
        comments_with_line_numbers.append((comment, self.getpos()[0]))

# Ensure proper usage
if len(sys.argv) != 3:
    print("Usage:")
    print("  html_extractor.py <input-file> <top-srcdir>")
    sys.exit(1)

# Path from current directory to top-level app directory
html_file = sys.argv[1]
top_dir = sys.argv[2]
# Path of the HTML file relative to the project root; this is what gets
# emitted in the "#line" directives below so translation tools can map
# extracted strings back to their source file.
final_path = os.path.relpath(html_file, top_dir)

# Create the BeautifulSoup HTML-parsing object
# NOTE(review): Python 2 urllib API — urlopen(pathname2url(...)) reads the
# local file through a URL-escaped path; presumably chosen over plain
# open() to tolerate special characters in the path. Confirm before porting.
page = urllib.urlopen(urllib.pathname2url(html_file)).read()
soup = BeautifulSoup(page)

# Extract all translatable strings from that HTML
# Elements are marked in the HTML with a name="translatable" attribute;
# only the first child (contents[0]) of each marked element is taken,
# encoded to UTF-8 bytes. (Python 2: map() returns a list here.)
translatable_divs = soup.find_all(attrs={"name" : "translatable"})
translatable_strings = map(lambda div: div.contents[0].encode('utf-8'), translatable_divs)

# Find the line numbers for those strings
# These two lists are module-level state read and written by the
# TranslatableHTMLParser handlers during feed().
all_translatable_data = []
comments_with_line_numbers = []
parser = TranslatableHTMLParser()
parser.feed(page)

# Write out all info about the translatable strings found in this file
# Output format per string: a C-preprocessor-style #line directive, an
# optional "//" translator comment, then a gettext-style _("...") call.
for string, line_num, optional_comment in all_translatable_data:
    print ("#line " + str(line_num) + " \"" + final_path + "\"")
    if optional_comment != "":
        print ("// " + optional_comment)
    print ("_(\"" + string + "\");")