summaryrefslogtreecommitdiff
path: root/lib/voices/finnish/suo_fi_lj_diphone/festvox/suopuhe_filter.perl
blob: db0912ad74e28800aa7b8e484cf185641e3c5e97 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/perl -w
use strict;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;                                                                       ;;
#;;          Department of General Linguistics / Suopuhe project          ;;
#;;                      University of Helsinki, FI                       ;;
#;;                  Copyright (c) 2000,2001,2002,2003                      ;;
#;;                        All Rights Reserved.                           ;;
#;;                                                                       ;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


#; This program is distributed under Gnu Lesser General Public License (cf. the
#; file LICENSE in distribution).
 
#; This program is free software; you can redistribute it and/or modify
#; it under the terms of the GNU Lesser General Public License as published by
#; the Free Software Foundation; either version 2 of the License, or
#; (at your option) any later version.
 
#; This program is distributed in the hope that it will be useful,
#; but WITHOUT ANY WARRANTY; without even the implied warranty of
#; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#; GNU Lesser General Public License for more details.

# Suopuhe BASH filter
# the input should be in one-sentence-per-line format

# Tue Sep  9 11:23:05 EEST 2003
# -Added the forgotten 'g' flag (s/// => s///g); without it, list intonation
# removed the wrong words

# Fri Oct 26 12:04:40 EEST 2001 
# -Added partial sayas-replacement already here, because of Festival
#  (rxp-parser?) bug.

# filter_markup($text)
#
# Cleans up the token markup held in $text and returns the cleaned string.
# The steps run in a fixed order, each one working on the result of the
# previous one:
#   1. strip <!-- ... --> comments and squeeze whitespace runs to one space
#   2. drop the original="<" and original="&" attributes
#   3. join space-separated words that directly follow a '>' with hyphens
#   4. delete <token ...> </token> pairs that contain only spaces
#   5. move <break/> and <phrase/> out of the edges of a token
#   6. collapse runs of <break/>/<phrase/> into the first one of the run
#   7. put a newline after every "> " and before every " <"
#
# An undefined $text is treated as empty input and yields ''.
sub filter_markup {
    my ($text) = @_;

    # Nothing was read (e.g. an unreadable file argument): return an empty
    # result instead of warning once per substitution under -w.
    return '' unless defined $text;

    # An opening token tag with any number of name="value" attributes.
    my $token_open = qr/<token(?: +[a-z]+="[^"]*")*>/;
    # One of the two empty boundary elements.
    my $boundary = qr/<(?:break|phrase)\/>/;

    # Strip comments, then squeeze every whitespace run (newlines included)
    # into a single space.
    $text =~ s{<!--.*?-->}{}gs;
    $text =~ s/\s+/ /g;

    # Work around XML parser bugs: drop these two attribute forms.
    $text =~ s/ original="<"//g;
    $text =~ s/ original="&"//g;

    # "> yksi kaksi <" becomes "> yksi-kaksi <" (a brute-force fix for a
    # Festival tokenising blunder).  Each pass joins one more word, so the
    # substitution is repeated until nothing is left to join.
    # NOTE(review): the three non-ASCII characters in the classes below look
    # mis-encoded in this copy of the file; presumably they are the Finnish
    # letters a-ring, a-umlaut and o-umlaut -- confirm against the original
    # file encoding before touching them.
    1 while $text =~ s/> ([A-Za-z���\-]+) ([A-Za-z���])/> $1-$2/;

    # Remove tokens whose content is nothing but spaces.
    $text =~ s/$token_open +<\/token>\s*//g;

    # A <break/> or <phrase/> sitting right before </token> is moved to just
    # after it.  Breaks are retried first on every pass; phrases are only
    # tried once no break is left to move.
    while (    $text =~ s/(<break\/>)\s*(<\/token>)/$2 $1/
            || $text =~ s/(<phrase\/>)\s*(<\/token>)/$2 $1/ )
    {
    }

    # Likewise, one sitting right after an opening <token ...> is moved to
    # just before it.
    1 while $text =~ s/($token_open)\s+($boundary)/$2 $1/g;

    # Eliminate <break/> <phrase/> sequences: the first one wins.
    $text =~ s/($boundary)(?: $boundary)*/$1/g;

    # One tag or text chunk per line.
    $text =~ s/> />\n/g;
    $text =~ s/ </\n</g;

    return $text;
}

# Read the whole input (STDIN or the files named on the command line) in one
# go; $/ is only undefined inside the do-block.
my $input = do { local $/; <> };

print filter_markup($input);