# =============================================================
# CONTENT:      Sample Finite-State Tokenizer (no multi-words)
# AUTHOR:       Original version by Anne Schiller, copied from the
#               Karttunen/Beesley book on finite state grammar
#               Modified by Trond Trosterud, 2002.
# CREATED:      12-Jun-1997
# UPDATED:      05-Sep-2001
# =============================================================
# Usage: xfst -f [ThisFile]
#
# Builds a transducer that puts every token on its own line:
# TOK1 appends a newline to each longest-match Token, TOK2 turns
# every run of white space into a single newline, and the two are
# composed and then inverted.
# =============================================================

clear stack

echo >>> define white space
define SP  " ";
define TAB "\t";
define NL  "\n";
define WS  [SP|NL|TAB];

# =============================================================
echo >>> define single character symbols

# Symbols that always form a token on their own.
define SINGLE [ %" | %. | %, | %; | %: | %! | %? | %( | %) | %[ | %] | %{ | %} | %´ | %ª ];

# Multi-character punctuation: two or three dots, doubled
# apostrophes, doubled commas.
# NOTE(review): the original listed the alternative  %' %'  twice.
# The duplicate is collapsed here (same language); the second copy
# was presumably a different quote character lost in an encoding
# conversion -- confirm against the upstream file.
define PUNCT [ %. %. (%.) | %' %' | %, %, ] ;

# Any single symbol that is neither white space nor a SINGLE symbol.
define Char \[ WS | SINGLE ] ;

# =============================================================
echo >>> define SYMBOL
define SYMBOL [ SINGLE | PUNCT ] ;

echo >>> define WORD
define WORD [ Char ]+ ;

# =============================================================
# The explicit abbreviation list is disabled.  If it is re-enabled,
# ABBR must also be added back to the Token definition below.
#echo >>> list of abbreviations
#define ABBR [
#Mr. | Mrs. | Ms.
#| etc. | e.g. | i.e.
#| ltd. | Ltd. | inc. | Inc.
#];

# =============================================================
echo >>> regular abbreviations

define Letter [
    а | и | і | о | ӧ | у | ы | э
  | А | И | І | О | Ӧ | У | Ы | Э
  | б | в | г | д | ж | з
  | Б | В | Г | Д | Ж | З
  | й | к | л | м | н | п | р | с | т | ф | х | ц
  | Й | К | Л | М | Н | П | Р | С | Т | Ф | Х | Ц
  | ч | ш | щ | ъ | ь
  | Ч | Ш | Щ | Ъ | Ь
  | е | ё | ю | я
  | Е | Ё | Ю | Я
];

# Initials: one or more "letter + full stop" pairs.
define INIT [ Letter %. ]+ ;

# =============================================================
echo >>> numeric expressions

# %0 is escaped because a bare 0 denotes epsilon in xfst regexes.
define Digit  [ %0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ];
define NumOp  [ %- | %+ | %* | %/ | %= | %: ];
define NumSep [ %. | %, ];

# Any mix of digits, operators and separators that contains at
# least one digit.
define NUM [ Digit | NumOp | NumSep ]+ & $[Digit] ;

# =============================================================
echo >>> define tokens

# Variant without NUM (originally kept "in order to allow 7-bit
# encoding"); disabled because the definition below replaced it
# immediately anyway:
#define Token [ WORD | SYMBOL | INIT ];

# NUM is enabled.  The original note said it was commented in again
# "since this is Lule Sámi".
# NOTE(review): the Letter inventory above is Cyrillic, so that note
# looks inherited from another language's tokenizer -- confirm.
#
# ABBR is deliberately NOT listed: its definition above is commented
# out, and xfst would compile the undefined name as a literal
# multicharacter symbol "ABBR" instead of an abbreviation list.
define Token [ WORD | SYMBOL | INIT | NUM ];

# =============================================================
echo >>> longest match--insert a newline after each token
define TOK1 [ Token @-> ... NL ] ;

echo >>> map spaces to a newline
define TOK2 [ [WS]+ @-> NL ];

# =============================================================
echo >>> compose
read regex [TOK1 .o. TOK2 ];

# Swap the two sides of the composed transducer.
invert net

#save stack tok.fst