diff options
Diffstat (limited to 'lib/dicts/cmu/cmulex.scm')
-rw-r--r-- | lib/dicts/cmu/cmulex.scm | 275 |
1 files changed, 250 insertions, 25 deletions
diff --git a/lib/dicts/cmu/cmulex.scm b/lib/dicts/cmu/cmulex.scm index 933b9b0..cfb99b3 100644 --- a/lib/dicts/cmu/cmulex.scm +++ b/lib/dicts/cmu/cmulex.scm @@ -38,6 +38,8 @@ (require 'pos) +(set! cmulex_version "2.0 (from 0.4) July 2008") + (define (cmu_lts_function word feats) "(cmu_lts_function word feats) Function called for CMULEX when word is not found in lexicon. Uses @@ -51,39 +53,81 @@ prediction rules." (if (string-matches dcword "[a-z]*") (begin (set! phones (lts_predict dcword cmu_lts_rules)) - (set! syls (lex.syllabify.phstress phones)) + (set! syls (cmulex_mosyl_phstress phones)) ) (set! syls nil)) (list word nil syls))) - (define (cmulex_addenda) "(cmulex_addenda) Add entries to the current lexicon (radio/darpa). These are basically words that are not in the CMU lexicon." - (lex.add.entry - '("t" n (((t iy) 1)))) - (lex.add.entry - '("I'll" v (((ay l) 1)))) - (lex.add.entry - '("I'd" v (((ay d) 1)))) - (lex.add.entry - '("I'm" v (((ay m) 1)))) - (lex.add.entry - '("uk" n (((y uw) 1) ((k ey) 1)))) - (lex.add.entry - '("w" n (((d ah b) 1) ((ah l) 0) ((y uw) 1)))) - (lex.add.entry - '("'s" pos (((ax z) 0)))) - (lex.add.entry - '("bought" v (((b ao t) 1)))) - (lex.add.entry - '("edinburgh" n (((eh d) 1) ((ah n) 0) ((b ax ) 0) ((r ow) 0)))) - (lex.add.entry - '("non-intoxicating" () - (((n aa n) 1) ((ih n t) 0) ((aa k) 1) ((s ih k) 0) ((ey t) 1) ((ih ng) 0)))) - (lex.add.entry - '("AT&T" n (((ey) 1) ((t iy) 1) ((ah n d) 0) ((t iy) 1)))) + (lex.add.entry '("worf" n (((w ao r f) 1)))) + (lex.add.entry '("t" n (((t iy) 1)))) + (lex.add.entry '("I'll" v (((ay l) 1)))) + (lex.add.entry '("it's" v (((ih t s) 1)))) + (lex.add.entry '("don't" v (((d ow n t) 1)))) + (lex.add.entry '("didn't" v (((d ih d n t) 1)))) + (lex.add.entry '("isn't" v (((ih z n t) 1)))) + (lex.add.entry '("doesn't" v (((d ah z n t) 1)))) + (lex.add.entry '("that's" v (((dh ae t s) 1)))) + (lex.add.entry '("won't" v (((w ow n t) 1)))) + (lex.add.entry '("aren't" v (((ae r n t) 1)))) + (lex.add.entry '("there's" v (((dh er z) 1)))) + (lex.add.entry '("we're" v (((w iy r) 1)))) + (lex.add.entry '("wouldn't" v (((w uh d n t) 1)))) + (lex.add.entry '("wasn't" v (((w aa z n t) 1)))) + (lex.add.entry '("they're" v (((dh er) 1)))) + (lex.add.entry '("weren't" v (((w er n t) 1)))) + (lex.add.entry '("i'm" v (((ay m) 1)))) + (lex.add.entry '("he's" v (((hh iy z) 1)))) + (lex.add.entry '("you're" v (((y uw r) 1)))) + (lex.add.entry '("haven't" v (((hh ae v n t) 1)))) + (lex.add.entry '("we've" v (((w iy v) 1)))) + (lex.add.entry '("i've" v (((ay v) 1)))) + (lex.add.entry '("hadn't" v (((hh ae d n t) 1)))) + (lex.add.entry '("they've" v (((dh ey v) 1)))) + (lex.add.entry '("shouldn't" v (((sh uh d n t) 1)))) + (lex.add.entry '("I'd" v (((ay d) 1)))) + (lex.add.entry '("they'll" v (((dh ey l) 1)))) + (lex.add.entry '("you've" v (((y uw v) 1)))) + (lex.add.entry '("you'll" v (((y uw l) 1)))) + (lex.add.entry '("I'll" v (((ay l) 1)))) + (lex.add.entry '("we'd" v (((w iy d) 1)))) + (lex.add.entry '("he'd" v (((hh iy d) 1)))) + (lex.add.entry '("he'll" v (((hh iy l) 1)))) + (lex.add.entry '("they'd" v (((dh ey d) 1)))) + (lex.add.entry '("you'd" v (((y uw d) 1)))) + (lex.add.entry '("it'll" v (((ih t) 1) ((ah l) 0)))) + (lex.add.entry '("who've" v (((hh uw v) 1)))) + (lex.add.entry '("ain't" v (((ey n t) 1)))) + (lex.add.entry '("needn't" v (((n iy d n t) 1)))) + (lex.add.entry '("she'd" v (((sh iy d) 1)))) + (lex.add.entry '("who'd" v (((hh uw d) 1)))) + (lex.add.entry '("she'll" v (((sh iy l) 1)))) + (lex.add.entry '("there'll" v (((dh er l) 1)))) + (lex.add.entry '("there'd" v (((dh er d) 1)))) + (lex.add.entry '("it'd" v (((ih t) 1) ((ah d) 0)))) + (lex.add.entry '("who'll" v (((hh uw l) 1)))) + (lex.add.entry '("that'll" v (((dh ae t l) 1)))) + (lex.add.entry '("mightn't" v (((m ay t n t) 1)))) + (lex.add.entry '("would've" v (((w uh d) 1) ((ah v) 0)))) + (lex.add.entry '("mustn't" v (((m ah s n t) 1)))) + (lex.add.entry '("how'd" v (((hh ow d) 1)))) + (lex.add.entry '("could've" v (((k uh d) 1) ((ah v) 0)))) + + (lex.add.entry '("hasn't" v (((hh ae z n t) 1)))) + (lex.add.entry '("couldn't" v (((k uh d n t) 1)))) + (lex.add.entry '("can't" v (((k ae n t) 1)))) + (lex.add.entry '("we'll" v (((w iy l) 1)))) + + (lex.add.entry '("uk" n (((y uw) 1) ((k ey) 1)))) + (lex.add.entry '("w" n (((d ah b) 1) ((ah l) 0) ((y uw) 1)))) + (lex.add.entry '("'s" pos (((ax z) 0)))) + (lex.add.entry '("bought" v (((b ao t) 1)))) + (lex.add.entry '("edinburgh" n (((eh d) 1) ((ah n) 0) ((b ax ) 0) ((r ow) 0)))) + (lex.add.entry '("non-intoxicating" () (((n aa n) 1) ((ih n t) 0) ((aa k) 1) ((s ih k) 0) ((ey t) 1) ((ih ng) 0)))) + (lex.add.entry '("AT&T" n (((ey) 1) ((t iy) 1) ((ah n d) 0) ((t iy) 1)))) (lex.add.entry '("cepstra" n (((k eh p) 1) ((s t r aa) 0)))) (lex.add.entry @@ -215,6 +259,187 @@ words that are not in the CMU lexicon." (((b iy) 0) ((f ao r) 1)))) ) +(define (cmulex_map_sylstructure syls) + (unwind-protect + (begin + (mapcar + (lambda (s1 s2) + (list s1 (cadr s2))) + (cmulex_syllabify_maxonset + (apply + append + (mapcar car syls))) + syls)) + (begin + (format t "Failed to resyllabify %l\n" syls) + syls)) +) + +(define (cmulex_mosyl_phstress phones) + (set! xxx (mapcar + (lambda (syl) + (set! stress 0) + (list + (mapcar + (lambda (p) + (cond + ((string-matches p "[aeiou@].*1") + (set! stress 1) + (intern (substring p 0 (- (length p) 1)))) + ((string-matches p "[aeiou@].*0") + (set! stress 0) + (intern (substring p 0 (- (length p) 1)))) + (t + (intern p)))) + (mapcar string-append syl)) + stress)) + (cmulex_syllabify_maxonset phones))) +; (format t "%l\n%l\n" phones xxx) + xxx + +) + +(define (cmulex_syllabify_maxonset phones) + (cmulex_syllabify_maxonset2 + (mapcar intern (reverse phones)) + nil nil) +) + +(define (cmulex_syllabify_maxonset2 phones syl syls) + "(cmulex_syllabify_maxonset phones) +Syllabify by maximum onset. phones is given in reverse order" +; (format t "csm2 phones %l syl %l syls %l\n" +; phones syl syls) + (cond + ((null phones) + (if syl + (cons (reverse syl) syls) + syls)) + ((null (cmulex_has_vowel phones)) ;; safety case + ;; could some weird onset we've never seen before + (cons + (append (reverse phones) syl) + syls)) + ((null (string-matches (car phones) "[aeiou@].*")) ;; a vowel + (cmulex_syllabify_maxonset2 + (cdr phones) + (cons (car phones) syl) + syls)) + (t ;; is a vowel + (let ((onset (cmulex_maxonset (cdr phones)))) + (cmulex_syllabify_maxonset2 + (nth_cdr (+ 1 (length onset)) phones) + nil + (cons (append onset (list (car phones)) syl) syls)))))) + +(define (cmulex_has_vowel p) + (cond + ((null p) nil) + ((string-matches (car p) "[aeiou@].*") ;; a vowel + t) + (t + (cmulex_has_vowel (cdr p))))) + +(define (cmulex_maxonset phones) + (cond + ((string-matches (car phones) "[aeiou@].*") + nil) + ((string-equal (car phones) "ng") ;; only non-syl-initial phone + nil) + ((null phones) nil) + ((and (> (length phones) 2) + (member (list (car (cddr phones)) + (cadr phones) + (car phones) + ) + cmulex_tri_onsets)) + (list (car (cddr phones)) + (cadr phones) + (car phones) + )) + ((and (> (length phones) 1) + (member (list (cadr phones) + (car phones)) + cmulex_di_onsets)) + (list (cadr phones) + (car phones))) + (t + (list (car phones))))) + +(set! cmulex_tri_onsets + '( + (s t r) + (s p y) + (s p r) + (s p l) + (s k y) + (s k w) + (s k r) + (s k l) + )) + +(set! cmulex_di_onsets +'( + (z w) + (z l) + (v y) + (v r) + (v l) + (th w) + (th r) + (t y) + (t w) +; (t s) + (t r) + (sh w) + (sh r) + (sh n) + (sh m) + (sh l) + (s w) + (s v) + (s t) + (s r) + (s p) + (s n) + (s m) + (s l) + (s k) + (s f) + (p y) + (p w) + (p r) + (p l) + (n y) + (m y) + (m r) + (l y) + (k y) + (k w) + (k r) + (k l) + (hh y) + (hh w) + (hh r) + (hh l) + (g y) + (g w) + (g r) + (g l) + (f y) + (f r) + (f l) + (d y) + (d w) + (d r) + (b y) + (b w) + (b r) + (b l) +)) + +(set! lex_syllabification (list cmulex_mosyl_phstress)) + (lex.create "cmu") (lex.set.compile.file (path-append cmulexdir "cmudict-0.4.out")) (lex.set.phoneset "radio") |