summaryrefslogtreecommitdiff
path: root/lib/voices/finnish/hy_fi_mv_diphone/festvox/suopuhe_filter.perl
blob: 1e12d68bb96e42b5abde8cd45f58f5a2dfa4372d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/perl -w
use strict;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;                                                                       ;;
#;;          Department of General Linguistics / Suopuhe project          ;;
#;;                      University of Helsinki, FI                       ;;
#;;                  Copyright (c) 2000,2001,2002,2003                      ;;
#;;                        All Rights Reserved.                           ;;
#;;                                                                       ;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;


#; This program is distributed under Gnu Lesser General Public License (cf. the
#; file LICENSE in distribution).
 
#; This program is free software; you can redistribute it and/or modify
#; it under the terms of the GNU Lesser General Public License as published by
#; the Free Software Foundation; either version 2 of the License, or
#; (at your option) any later version.
 
#; This program is distributed in the hope that it will be useful,
#; but WITHOUT ANY WARRANTY; without even the implied warranty of
#; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#; GNU Lesser General Public License for more details.

# Suopuhe's BASH filter.
# The input should be in one-sentence-per-line form.

# Fri Oct 26 12:04:40 EEST 2001 
# -Added partial sayas-replacement already here, because of Festival
#  (rxp-parser?) bug.

# Clean up the token-level XML markup before it is handed on.
#
# Takes the whole document as a single string and returns the filtered
# string:
#   * XML comments are removed and every whitespace run becomes one space;
#   * original="<" and original="&" attributes are dropped;
#   * space-separated words directly following a '>' are joined with '-';
#   * every <token ...> element containing only spaces is removed;
#   * the space after each '>' and before each '<' becomes a newline.
sub _filter_markup {
    my ($text) = @_;

    # Strip comments (they may span lines, hence /s), then flatten all
    # whitespace so the rest of the rules can rely on single spaces.
    $text =~ s/<!\-\-.*?\-\->//gs;
    $text =~ s/\s+/ /gs;

    # The XML parser trips over these attribute values.
    $text =~ s/ original=\"<\"//g;
    $text =~ s/ original=\"\&\"//g;

    # "> yksi kaksi <" becomes "> yksi-kaksi <" (a crude fix for Festival's
    # token handling).  Each pass joins one more word, so repeat until
    # nothing is left to join.
    # NOTE(review): the three non-ASCII characters in each class look
    # mis-encoded in this copy; presumably the Finnish letters a-ring,
    # a-umlaut and o-umlaut -- confirm against the input encoding.
    1 while $text =~ s/> ([A-Za-z���\-]+) ([A-Za-z���])/> $1-$2/;

    # Remove empty tokens.  The whole document is one line at this point,
    # so /g is required to drop all of them and not just the first.
    $text =~ s/<token( +[a-z]+=\"[^\"]*\")*> +<\/token>\s*//g;

    # Put tags and text on lines of their own.
    $text =~ s/> />\n/g;
    $text =~ s/ </\n</g;

    return $text;
}

# Slurp the input in one go; the record separator is only changed inside
# this block.
my $document = do { local $/; <> };

# <> yields undef when nothing could be read; print nothing in that case
# instead of running the substitutions on an undefined value.
print _filter_markup($document) if defined $document;

#my $status = 1;
#while ( <> ) {
#    if ( /<utterance>/ ) { $status = 1; }
#    # skipataan aloittavat puncit
#    # voisi olla parempi ehk� lukea ne...
#    elsif ( /<token pos=\"punc\">/ ) { next; }
#    else { $status = 0; }
#    # delete comments...
#    s/<!\-\-.*?\-\->//g;
#
#    s/ original=\"<\"//g; # XML-parseri bugaa t�h�n
#    s/ original=\"\&\"//g;
#    
#    # #> yksi kaksi <# muuttuu muotoon #> yksikaksi <# 
#    while ( s/> ([A-Za-z���]+) ([A-Za-z���])/> $1$2/ ) {} 
#    
#    # tyhj�t pois... (toimiikohan)
#    s/<token( +[a-z]+=\"[^\"]*\")*> +<\/token>//;     
#    
#
#    
#    print;
#}