# =============================================================
# CONTENT:      Sample Finite-State Tokenizer (no multi-words)
#               NOTE: despite the title, this version does mark a
#               small fixed list of multi-word expressions (see MWE).
# AUTHOR:       Original version by Anne Schiller, copied from the
#               Karttunen/Beesley book on finite state grammar
#               Modified by Trond Trosterud, 2002.
# CREATED:      12-Jun-1997
# UPDATED:      05-Sep-2001
# =============================================================
# Usage:        xfst -f [ThisFile]
#
# Result: one network on the stack. The composed cascade maps raw
# text (upper side) to text with a newline after every token (lower
# side); the final "invert net" swaps the two sides.
# =============================================================

clear stack

echo >>> define white space
define SP   " ";
define TAB  "\t";
define NL   "\n";
define WS   [SP|NL|TAB];

# =============================================================
echo >>> define single character symbols
# Characters that always form a token of their own.
define SINGLE [ %" | %. | %, | %; | %: | %! | %? | %( | %) | %[ | %] | %{ | %} | %« | %» ];
# " for the colouring only.

# Multi-character punctuation tokens: ".." / "...", doubled apostrophe,
# doubled comma.
# NOTE(review): the second and third alternatives are identical
# (%' %'); one of them was possibly meant to be a different quote
# character -- confirm against the original source.
define PUNCT [ %. %. (%.) | %' %' | %' %' | %, %, ] ;

# Any symbol that is neither white space nor a SINGLE symbol.
define Char \[ WS | SINGLE ] ;

# =============================================================
echo >>> define SYMBOL
define SYMBOL [ SINGLE | PUNCT ] ;

echo >>> define WORD
define WORD [ Char ]+ ;

# =============================================================
# The explicit abbreviation list is disabled. If it is re-enabled,
# ABBR must also be added back to the Token union below.
#echo >>> list of abbreviations
#define ABBR [
#Mr. | Mrs. | Ms.
#| etc. | e.g. | i.e.
#| ltd. | Ltd. | inc. | Inc.
#];

# =============================================================
echo >>> regular abbreviations
# Every alternative must be separated by "|": adjacent symbols without
# it are concatenated into one two-symbol sequence. (The original was
# missing the bar between z and å, and listed lower-case þ in the
# upper-case row where Þ belongs.)
define Letter [A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|
 Å|Ä|Ö|Ø|Æ|Á|É|Ó|Ú|Í|À|È|Ò|Ù|Ì|Ë|Ü|Ï|Â|Ê|Ô|Û|Î|Ã|Ý|Þ|Ñ|Ð|
 a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|
 å|ä|ö|ø|æ|á|é|ó|ú|í|à|è|ò|ù|ì|ë|ü|ï|â|ê|ô|û|î|ã|ý|þ|ñ|ð|ç|ß|'];

# One or more "letter + full stop" pairs, e.g. initials.
define INIT [ Letter %. ]+ ;

# =============================================================
echo >>> numeric expressions
# %0 is escaped because a bare 0 denotes epsilon in xfst regexes.
define Digit  [ %0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9];
define NumOp  [ %- | %+ | %* | %/ | %= | %: ];
define NumSep [ %. | %, ];
# Any run of digits, operators and separators containing at least
# one digit.
define NUM [ Digit | NumOp | NumSep]+ & $[Digit] ;

# =============================================================
echo >>> some multi-words
# enclosed by {} since the letters shall be read as separate symbols
define MWE [
  {keffrys ha}
| {kekeffrys ha}
| {kehys ha}
| {bys pan}
| {kyns es}
| {kepar ha pan}
] ;

# marker for multi-words:
define M1 "<<" ;
define M2 ">>" ;
define MWE1 [M1 MWE M2];

# =============================================================
echo >>> define tokens
# ABBR is intentionally not listed: its definition above is commented
# out, and xfst would compile the undefined name as a literal
# multi-character symbol "ABBR".
define Token [ WORD | SYMBOL | INIT | NUM | MWE1 ];

# =============================================================
echo >>> longest match
define Bound [ SINGLE | WS | .#. ] ;

# Three composed steps:
#   1. bracket each multi-word expression found between boundaries
#      with M1 ... M2 (longest match);
#   2. insert a newline after every token (longest match);
#   3. delete the M1/M2 markers again.
define TOK1 [
  MWE @-> M1 ... M2 || Bound _ Bound
  .o.
  Token @-> ... NL
  .o.
  [M1|M2] -> 0
];

echo >>> normalize space
# Collapse every run of white space to a single space.
define TOK0 [ WS+ @-> SP ];

echo >>> remove spaces
# A white-space run containing at least one newline becomes a single
# newline.
define WS1 [WS]+ & $[NL] ;
define TOK2 [ WS1 @-> NL ] ;

# =========================================================
echo >>> compose
read regex [TOK0 .o. TOK1 .o. TOK2 ];

# absorb the special brackets into the unknown alphabet
substitute symbol ? for "<<"
substitute symbol ? for ">>"

# Earlier variant without multi-word handling, kept for reference:
# echo >>> longest match--insert a newline after each token
# define TOK1 [ Token @-> ... NL ] ;
# echo >>> map spaces to a newline
# define TOK2 [ [WS]+ @-> NL ];
#
#
# =============================================================
# echo >>> compose
# read regex [TOK1 .o. TOK2 ];

invert net
#save stack tok.fst