summaryrefslogtreecommitdiff
path: root/tools/eos-html-extractor
blob: bf50e98f83973c5af66f01f17994ce4a05db6be1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3

# Copyright 2013-2015 Endless Mobile, Inc.

import argparse
import io
import os.path
import re
import sys
from html.parser import HTMLParser

# Translation table applied to each extracted string before it is written
# inside a double-quoted _("...") literal. The backslash has to be escaped as
# well as the quote: otherwise a literal backslash in the HTML text would be
# read as the start of an escape sequence by whatever parses the output.
# str.translate works one character at a time, so the backslashes inserted
# for '"' are not escaped a second time.
ESCAPES = str.maketrans({
    '\\': '\\\\',
    '"': '\\"',
})

def normalize_string(string):
    """Trim *string* and collapse each run of whitespace to one space."""
    # split() with no arguments discards leading/trailing whitespace and
    # treats any run of whitespace as a single separator.
    words = string.split()
    return ' '.join(words)


# Parser that adds line numbers to the HTML strings that need translating
class TranslatableHTMLParser(HTMLParser):
    """Collect the contents of elements carrying ``name="translatable"``.

    After feeding a document, ``all_translatable_data`` holds one
    ``(text, line, comment)`` tuple per translatable element:

    * ``text`` is the element's inner content, nested tags included, passed
      through ``normalize_string``;
    * ``line`` is the line reported by ``getpos()`` when the element's
      closing tag is handled;
    * ``comment`` is the most recent HTML comment if it ends on the line
      directly before ``line``, otherwise ``None``.
    """

    def __init__(self):
        super().__init__()
        self.all_translatable_data = []
        # (comment text, line on which the comment starts) pairs
        self._comments_with_line_numbers = []
        # Nesting depth of the translatable element being collected, or None
        # when the parser is not inside one
        self._current_translatable_tag_level = None
        # Stack of tag names that have been opened and not yet closed
        self._opened_tags = []
        self._tag_level = 0
        self._text = ''

    def handle_starttag(self, tag, attrs):
        """Track nesting; start collecting at a translatable element."""
        self._tag_level += 1
        self._opened_tags.append(tag)

        if self._current_translatable_tag_level is not None:
            # Nested markup is kept verbatim as part of the string
            self._text += self.get_starttag_text()
            return

        if ('name', 'translatable') in attrs:
            self._current_translatable_tag_level = self._tag_level

    def handle_endtag(self, tag):
        """Track nesting; emit the collected string when its element closes."""
        # A stray end tag with no matching start tag must not unwind the
        # stack: popping until a match would empty it and raise IndexError.
        if tag not in self._opened_tags:
            return

        self._tag_level -= 1
        # In non-X HTML, there can be tags that don't close, e.g. <meta>, <br>
        while self._opened_tags.pop() != tag:
            self._tag_level -= 1

        if (self._current_translatable_tag_level is not None and
                self._current_translatable_tag_level > self._tag_level):
            # The translatable element itself has just been closed
            self._current_translatable_tag_level = None

        if self._current_translatable_tag_level is not None:
            # Still inside the translatable element: keep the nested end tag
            self._text += '</' + tag + '>'
            return

        if not self._text:
            return

        code_line = self.getpos()[0]
        optional_comment = None

        if self._comments_with_line_numbers:
            # Determine if comment should be included
            comment_string, comment_line = self._comments_with_line_numbers[-1]

            # Comment takes up at least one line by default (hence the +1)
            comment_length = comment_string.count('\n') + 1

            # If the comment immediately preceded this string, include it
            if comment_line + comment_length == code_line:
                optional_comment = ' '.join(comment_string.split())

        self.all_translatable_data.append(
            (normalize_string(self._text), code_line, optional_comment))
        self._text = ''

    def handle_data(self, data):
        """Append text to the current string while inside a translatable element."""
        if self._current_translatable_tag_level is None:
            return
        self._text += data

    def handle_comment(self, comment):
        """Remember each comment together with the line it starts on."""
        self._comments_with_line_numbers.append((comment, self.getpos()[0]))

# Ensure stdout is UTF-8
default_out = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')

arg_parser = argparse.ArgumentParser(description='Extract translatable strings ' +
    'from HTML files. This is xgettext for HTML.')
arg_parser.add_argument('input_file', type=str,
    help='Input file to scan')
arg_parser.add_argument('top_srcdir', type=str, nargs='?', default='.',
    help='Top-level source directory (for printing correct #line directives)')
arg_parser.add_argument('-o', '--output', default=default_out,
    type=argparse.FileType('w', encoding='utf-8'),
    help='File to write (default: stdout)')
args = arg_parser.parse_args()

# Path from current directory to top-level app directory
html_file = args.input_file
top_dir = args.top_srcdir
final_path = os.path.relpath(html_file, top_dir)
out_file = args.output

with open(html_file, encoding='utf-8') as f:
    page = f.read()

# Extract all translatable strings from the HTML and find the line numbers for
# those strings
html_parser = TranslatableHTMLParser()
html_parser.feed(page)
# Tell the parser the input is complete so that buffered data is processed
html_parser.close()

# Write out all info about the translatable strings found in this file
for string, line_num, optional_comment in html_parser.all_translatable_data:
    out_file.write('#line {line} "{path}"\n'.format(line=line_num, path=final_path))
    if optional_comment:
        out_file.write('// {}\n'.format(optional_comment))
    out_file.write('_("{string}");\n'.format(string=string.translate(ESCAPES)))

# Flush explicitly so that a write error is raised here instead of being
# silently ignored when the stream is finalized at interpreter shutdown.
out_file.flush()
# Close a file opened for -o; leave the stdout streams to the interpreter.
if out_file is not default_out and out_file is not sys.stdout:
    out_file.close()