blob: db0912ad74e28800aa7b8e484cf185641e3c5e97 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
#!/usr/bin/perl -w
use strict;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;; ;;
#;; Department of General Linguistics / Suopuhe project ;;
#;; University of Helsinki, FI ;;
#;; Copyright (c) 2000,2001,2002,2003 ;;
#;; All Rights Reserved. ;;
#;; ;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#; This program is distributed under Gnu Lesser General Public License (cf. the
#; file LICENSE in distribution).
#; This program is free software; you can redistribute it and/or modify
#; it under the terms of the GNU Lesser General Public License as published by
#; the Free Software Foundation; either version 2 of the License, or
#; (at your option) any later version.
#; This program is distributed in the hope that it will be useful,
#; but WITHOUT ANY WARRANTY; without even the implied warranty of
#; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#; GNU Lesser General Public License for more details.
# BASH filter for Suopuhe
# the input should be in sentence-per-line format
# Tue Sep 9 11:23:05 EEST 2003
# -Added the forgotten 'g' flag (s/// => s///g); its absence caused list
# intonation to remove the wrong words
# Fri Oct 26 12:04:40 EEST 2001
# -Added partial sayas-replacement already here, because of Festival
# (rxp-parser?) bug.
# Slurp the whole input (files named on the command line, or STDIN) into a
# single string.  The record separator is undefined only for this one read.
my $text = do { local $/; <> };

# Strip XML comments; /s lets a comment span several lines.
$text =~ s/<!\-\-.*?\-\->//gs;

# Collapse every run of whitespace (newlines included) into one space.
$text =~ s/\s+/ /gs;

# Work around XML parser bugs: drop original="<" and original="&" attributes.
$text =~ s/ original=\"<\"//g;
$text =~ s/ original=\"\&\"//g;

# "> one two <" becomes "> one-two <": words that follow a '>' and are
# separated by a single space are joined with hyphens, one pair per pass,
# until nothing matches (a crude fix for a Festival tokenisation mistake).
# NOTE(review): the three non-ASCII characters in each character class look
# mis-encoded in this copy; presumably they are the Finnish letters a-ring,
# a-umlaut and o-umlaut -- confirm the file encoding before touching them.
1 while $text =~ s/> ([A-Za-z���\-]+) ([A-Za-z���])/> $1-$2/;

# Remove tokens whose content is only spaces, plus any whitespace after them.
# NOTE(review): this runs before the break/phrase hoisting below, so a token
# that is left holding only whitespace by that step is not removed here --
# confirm whether that is intended.
$text =~ s/<token( +[a-z]+=\"[^\"]*\")*> +<\/token>\s*//g;

# Keep <break/> and <phrase/> off the edges of a token.
# First, one that sits directly before a closing </token> is moved to just
# after it; <break/> is tried first, then <phrase/>, repeating until neither
# substitution matches.
1 while $text =~ s/(<break\/>)\s*(<\/token>)/$2 $1/
     || $text =~ s/(<phrase\/>)\s*(<\/token>)/$2 $1/;

# Then, one that follows an opening <token ...> tag (separated by whitespace)
# is moved in front of that tag.
1 while $text =~ s/(<token( +[a-z]+=\"[^\"]*\")*>)\s+(<(break|phrase)\/>)/$3 $1/g;

# Collapse a space-separated run of <break/> / <phrase/> tags to its first
# member: the first one wins.
$text =~ s/(<(break|phrase)\/>)( <(break|phrase)\/>)*/$1/g;

# Turn the space after a '>' and the space before a '<' into newlines.
$text =~ s/> />\n/g;
$text =~ s/ </\n</g;

print $text;
|