!! Requires a recent version of HFST (3.10.0 / git revision>=3aecdbc)
!! Then just:
!! $ make
!! $ echo "ja, ja" | hfst-tokenise --gtd tokeniser-disamb-gt-desc.pmhfst

!! Issues:
!! - [X] Ambiguous input
!!   - Seems to work fine
!! - [X] Ambiguous multiword expressions with ambiguous tokenisation
!!   - Seems to work – represented within lexc now; hfst-tokenise also
!!     supports forms on the analyses now
!! - [X] Ambiguous multiword expressions need reorganising after CG
!!   - The module cg-mwesplit takes wordforms from readings and turns them into
!!     new cohorts
!! - [X] Unknown words
!!   - The set-difference method only works for words without
!!     flag diacritics (even though we should be working only on the form-side?)
!!     and leads to binary blow-up: With only lower unknowns, we get 45M;
!!     lower+upper gives 67M, while no unknowns gives 27M
!!   - Fixed instead by treating empty analyses as unknown-tokens in
!!     hfst-tokenise, and outputting unmatched strings with a prefix
!! - [ ] Treat input that's within superblanks as unmatched
!!   - probably requires a change in hfst-tokenise itself
!! - [X] Try >1 space for ambiguous MWE's? – represented within lexc now
!! - [ ] Try set-difference-unknowns method with regular hfst commands?

!! More usage examples:
!! $ echo "Juos gorreválggain lea (dárbbašlaš) deavdit gáibádusa boasttu olmmoš, man mielde lahtuid." | hfst-tokenise --gtd tokeniser-disamb-gt-desc.pmhfst
!! $ echo "(gáfe) 'ja' ja 3. ja? ц jaja ukjend \"ukjend\"" | hfst-tokenise --gtd tokeniser-disamb-gt-desc.pmhfst
!! $ echo "márffibiillagáffe" | hfst-tokenise --gtd tokeniser-disamb-gt-desc.pmhfst

!! Pmatch documentation:
!! https://kitwiki.csc.fi/twiki/bin/view/KitWiki/HfstPmatch

!! Switch off pmatch's need-separators option for this script.
!! NOTE(review): presumably disabled because the word definitions below state
!! their own boundary conditions with LC()/RC() -- confirm against the pmatch
!! documentation.
set need-separators off

!! Load the precompiled morphological analyser from a binary HFST file.
!! It is the basis of both morphoword and incondword below.
!! NOTE(review): the path is relative, presumably to the directory where the
!! script is compiled -- confirm against the build rules.
Define morphology @bin"analyser_relabelled-gramcheck-gt-desc.hfst" ;

!! Load the precompiled URL transducer and invert it with .i (input and
!! output sides swapped). Used by urlword below.
!! NOTE(review): presumably inverted so that the surface form ends up on the
!! side that pmatch matches against -- confirm with the url.hfst build.
Define url @bin"../../src/morphology/url.hfst".i ;

!! Symbols that count as "unconditional" forms: the built-in Punct class plus
!! a few literal characters listed explicitly. They are used both as token
!! boundaries (via blank) and as the required first symbol of incondword.
!! NOTE(review): the last alternative holds an invisible character, likely
!! U+00AD SOFT HYPHEN -- verify with a hex dump before editing this line.
Define incondform      Punct | {“} | {”} | {…} | {­} ;

!! Anything that may delimit a word: built-in Whitespace or an incondform
!! symbol. Used inside the LC()/RC() contexts of the word definitions.
Define blank           Whitespace | incondform ;

!! incondword: the part of morphology selected by intersecting it with
!! [ incondform 0:?* ], i.e. entries that start with an incondform symbol.
!! It carries no LC()/RC() context, so it may match regardless of neighbours.
!! NOTE(review): the exact scope of the * in `0:?*` depends on operator
!! precedence -- confirm before changing this expression.
Define incondword       morphology & [ incondform 0:?* ] ;
!! morphoword: any morphology match whose left context is a blank or the
!! boundary symbol #, and whose right context is likewise a blank or #.
Define morphoword       morphology                   LC([blank | #]) RC([blank | # ]);
!! urlword: same boundary conditions as morphoword, but matching the url
!! transducer instead of the morphology.
Define urlword          url                          LC([blank | #]) RC([blank | # ]);

!! Apart from what's in our morphology, there are
!! 1) unknown word-like forms, and
!! 2) unmatched strings
!! We want to give 1) a match, but let 2) be treated specially by hfst-tokenise -a
!! Letters allowed in unknown word-like forms: the ranges a-z and A-Z plus the
!! extra letters listed explicitly, lower case first, then upper case.
Define alphabet [ "a-z" | {á} | {š} | {ž} | {č} | {đ} | {ŋ}
                | {ŧ} | {æ} | {ä} | {ø} | {ö} | {å} ]
              | [ "A-Z" | {Á} | {Š} | {Ž} | {Č} | {Đ} | {Ŋ}
                | {Ŧ} | {Æ} | {Ä} | {Ø} | {Ö} | {Å} ] ;
!! A word-like form is one or more letters from alphabet.
Define alphaword [ alphabet ]+ ;
!! TODO: Could use something like this, but the built-ins don't include šžđčŋ:
! Define MixCase(X) [OptCap(X) | UpCase(X)];
! Define alphaword MixCase(alphabet+);


!!! Unknown handling:
!! The form side of an unknown word: the upper projection (.u) of alphaword.
Define unknownform alphaword.u ;
!! Pair every unknown form with the empty string, which yields an empty
!! reading. The match must be delimited by a blank or the boundary symbol #
!! on both sides.
Define unknownwordEmpty unknownform:0 LC([blank | #]) RC([blank | #]) ;
!! hfst-tokenise --gtd will treat such empty analyses as unknowns, and
!! remove empty analyses from other readings. Empty readings are also
!! legal in CG, they get a default baseform equal to the wordform, but
!! no tag to check, so it's safer to let hfst-tokenise handle them.


!!! Superblank handling (TODO):
! Define anyExceptEsc [ ? - [ %\ ] ];
! Define unescaped [ anyExceptEsc | %\ ? | # ]+;
! Define anyExceptSuperendEsc [ ? - [ %\ | %] ] ];
! Define notSuperend [ %\ ? | anyExceptSuperendEsc ];
! Define superblank %[ notSuperend* %] EndTag(superblank) LC(unescaped) ;
! Define tokenOrBlank [ token | superblank ];
!! Needs hfst-tokenise to output things differently depending on the tag they get


!! A token is any one of the word types defined above; every match is closed
!! with EndTag(token). urlword is referenced through Ins() rather than being
!! written inline like the other alternatives.
Define token [ morphoword
             | unknownwordEmpty
             | incondword
             | Ins(urlword)
             ] EndTag(token) ;

!! Entry point of the script: use token as the expression to compile.
!! NOTE(review): presumably this is what the compiled .pmhfst applies when
!! run through hfst-tokenise -- confirm against the pmatch documentation.
regex token ;