# =============================================================
# CONTENT: Sample Finite-State Tokenizer (extended with a small multi-word list)
# AUTHOR: Original version by Anne Schiller, copied from the
# Karttunen/Beesley book on finite state grammar
# Modified by Trond Trosterud, 2002.
# CREATED: 12-Jun-1997
# UPDATED: 05-Sep-2001
# =============================================================
# Usage: xfst -f [ThisFile]
# =============================================================

clear stack
echo >>> define white space
# The three individual whitespace symbols: blank, tab and newline.
define SP  " " ;
define TAB "\t" ;
define NL  "\n" ;

# WS matches exactly one whitespace symbol of any of the three kinds.
define WS [ SP | TAB | NL ] ;

# =============================================================
echo >>> define single character symbols
# SINGLE: punctuation and bracket characters, each one symbol long.
# Every character is escaped with % so that it is read literally.
define SINGLE [ %" | %. | %, | %; | %: | %! | %?
| %( | %) | %[ | %] | %{ | %} | %« | %» 
];
# " for the colouring only.
# PUNCT: punctuation sequences longer than one symbol:
# two periods optionally followed by a third, two apostrophes, two commas.
# NOTE(review): the alternative %' %' used to be listed twice; the
# duplicate added nothing to the language and has been dropped. If a
# different quote pair was intended in its place, add it here.
define PUNCT [ %. %. (%.) | %' %' | %, %, ] ;
# Char: any single symbol that is neither whitespace nor in SINGLE.
define Char \[ WS | SINGLE ] ;

# =============================================================
echo >>> define SYMBOL
# A SYMBOL token is either one of the PUNCT sequences or a single
# character from SINGLE.
define SYMBOL [ PUNCT | SINGLE ] ;

echo >>> define WORD
# A WORD token is a non-empty run of Char symbols.
define WORD Char+ ;

# =============================================================
#echo >>> list of abbreviations
#define ABBR [
#Mr. | Mrs. | Ms.
#| etc. | e.g. | i.e.
#| ltd. | Ltd. | inc. | Inc.
#];

# =============================================================
echo >>> regular abbreviations
# Letter: one alphabetic symbol (upper or lower case, including the
# accented letters listed below) or an apostrophe.
# Every row must be joined to the next with "|": without it the last
# symbol of one row and the first symbol of the next are concatenated
# into a two-symbol string instead of being two alternatives.
define Letter [
  A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z
| Å|Ä|Ö|Ø|Æ|Á|É|Ó|Ú|Í|À|È|Ò|Ù|Ì|Ë|Ü|Ï|Â|Ê|Ô|Û|Î|Ã|Ý|Þ|Ñ|Ð
| a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z
| å|ä|ö|ø|æ|á|é|ó|ú|í|à|è|ò|ù|ì|ë|ü|ï|â|ê|ô|û|î|ã|ý|þ|ñ|ð|ç|ß
| %'
];
# INIT: one or more repetitions of a Letter followed by a period.
define INIT [ Letter %. ]+ ;

# =============================================================
echo >>> numeric expressions
# Digit: the ten decimal digits. Zero is written %0 so that it is read
# as the literal character rather than as the empty string.
define Digit  [ %0 | 1 | 2 | 3 | 4
              | 5 | 6 | 7 | 8 | 9 ] ;
# NumOp: operator characters that may occur inside a numeric expression.
define NumOp  [ %+ | %- | %* | %/ | %= | %: ] ;
# NumSep: period or comma.
define NumSep [ %, | %. ] ;
# NUM: a non-empty string of digits, operators and separators that
# contains at least one digit.
define NUM $[Digit] & [ Digit | NumSep | NumOp ]+ ;
# =============================================================
echo >>> some multi-words
# Each entry is written inside curly braces so that it is compiled as a
# sequence of single-character symbols (the blank included) instead of
# as one multi-character symbol.
define MWE [
  {bys pan}
| {kehys ha}
| {keffrys ha}
| {kekeffrys ha}
| {kepar ha pan}
| {kyns es}
] ;

# Opening and closing marker symbols for a multi-word expression.
define M1 "<<" ;
define M2 ">>" ;
# MWE1: a multi-word expression enclosed in its two markers.
define MWE1 M1 MWE M2 ;

# =============================================================
echo >>> define tokens
# Token: the union of everything that counts as one token: a word, a
# punctuation symbol, an initial-style abbreviation (INIT), a numeric
# expression, or a marked multi-word expression.
# ABBR is not part of the union: its only definition in this file is
# commented out, and xfst compiles an undefined name as a literal
# multi-character symbol, which would make the string "ABBR" a token
# symbol and add it to the alphabet. Put ABBR back here only together
# with an active "define ABBR".
define Token [ WORD | SYMBOL | INIT | NUM | MWE1 ];
# =============================================================
echo >>> longest match
# Bound: what may stand on either side of a multi-word expression:
# a SINGLE symbol, a whitespace symbol, or the edge of the string (.#.).
define Bound [ SINGLE | WS | .#. ] ;

# TOK1 is a cascade of three replace rules, composed in this order:
#   1. every MWE that stands between two Bound contexts is wrapped in
#      the markers M1 ... M2 (left-to-right, longest match);
#   2. a newline is inserted after every Token (left-to-right, longest
#      match), so a marked multi-word expression is kept as one token;
#   3. the markers M1 and M2 are deleted again.
define TOK1 [
MWE @-> M1 ... M2 || Bound _ Bound
.o. Token @-> ... NL
.o. [M1|M2] -> 0
];
echo >>> normalize space
# TOK0: rewrite each longest run of whitespace symbols as a single blank.
define TOK0 [ [WS]+ @-> SP ] ;
echo >>> remove spaces
# WS1: a run of whitespace symbols that contains at least one newline.
define WS1 $[NL] & [WS]+ ;
# TOK2: rewrite each longest such run as a single newline.
define TOK2 [ WS1 @-> NL ] ;
# =========================================================
echo >>> compose
# Compile the complete tokenizer and push it onto the stack. The three
# stages are composed in this order: whitespace normalisation (TOK0),
# token marking (TOK1), then whitespace-run-to-newline (TOK2).
read regex [TOK0 .o. TOK1 .o. TOK2 ];
# absorb the special brackets into the unknown alphabet:
# each marker symbol in the compiled network is replaced by the
# unknown symbol ?.
substitute symbol ? for "<<"
substitute symbol ? for ">>"

# Earlier variant without the multi-word marking step, kept for reference:
# echo >>> longest match--insert a newline after each token
# define TOK1 [ Token @-> ... NL ] ;
# echo >>> map spaces to a newline
# define TOK2 [ [WS]+ @-> NL ];
# 
# # =============================================================
# echo >>> compose
# read regex [TOK1 .o. TOK2 ];
# Swap the upper and lower sides of the network on top of the stack.
# NOTE(review): presumably done so that the saved network has the
# direction the calling tool expects -- confirm against how the
# tokenizer is applied.
invert net
#save stack tok.fst