# =============================================================
# CONTENT:     Finite-State Tokenizer (no multi-words)
# AUTHOR:      Original version by Anne Schiller, copied from the
#              Karttunen/Beesley book on finite state grammar
#              Modified and extended by Trond Trosterud, 2001-2003.
# CREATED:     12-Jun-1997
# UPDATED:     05-Sep-2001 - 15-Feb-2003
# =============================================================
# Usage:       xfst -f [ThisFile]
# =============================================================
# Thomas testing... delete

clear stack

# -------------------------------------------------------------
# White space: the three characters that count as separators.
# -------------------------------------------------------------
echo >>> define white space
define SP  " ";
define TAB "\t";
define NL  "\n";
define WS  [SP|NL|TAB];

# =============================================================
# Punctuation characters that always form a token of their own.
# Every character is escaped with % so that it is read literally.
# =============================================================
echo >>> define single character symbols
define SINGLE [ %" | %« | %» | %. | %, | %; | %: | %! | %?
              | %( | %) | %[ | %] | %{ | %} | %/ | %% | %' ];

# Added %' to include 'dat' pro "dat", this may crash due to the grade 3 mark.
# The parser erroneously #stops# for a % mark, awaits the next one
# (cf. trying 987% in xfst lookup mode).
# Added the string ' | %/' above, for ja/dahje cases.

# Multi-character punctuation: two or three periods, doubled
# apostrophes, doubled commas.
# NOTE(review): the alternative %' %' is listed twice; possibly one of
# them was meant to be a different quote character -- confirm.
define PUNCT [ %. %. (%.) | %' %' | %' %' | %, %, ] ;

# Any character that is neither white space nor SINGLE punctuation.
define Char \[ WS | SINGLE ] ;

# =============================================================
echo >>> define SYMBOL
# An earlier revision also unioned in the name EXTRAPERIOD, but no such
# network is defined in this file.  xfst reads an undefined name as a
# literal multicharacter symbol, so that version silently added a
# symbol "EXTRAPERIOD" to the alphabet.  The union is therefore limited
# to the two networks that actually exist.
define SYMBOL [ SINGLE | PUNCT ] ;

echo >>> define WORD
# A word is a maximal run of non-space, non-punctuation characters.
define WORD [ Char ]+ ;

# =============================================================
# Letter classes.  Capital is used as right context by the
# sentence-boundary rules further down.
# =============================================================
echo >>> regular abbreviations
# The last row previously listed lowercase thorn (also present in
# Small); the upper-case counterpart is what belongs here.
define Capital [ Č|Đ|Ŋ|Š|Ŧ|Ž|
                 A|B|C|D|E|F|G|H|I|J|K|L|M|N|O|P|Q|R|S|T|U|V|W|X|Y|Z|
                 Å|Ä|Ö|Ø|Æ|Á|É|Ó|Ú|Í|À|È|Ò|Ù|Ì|Ë|Ü|Ï|Â|Ê|Ô|Û|Î|Ã|Ý|Þ|Ñ|Ð ] ;

define Small [ č|đ|ŋ|š|ŧ|ž|
               a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z|
               å|ä|ö|ø|æ|á|é|ó|ú|í|à|è|ò|ù|ì|ë|ü|ï|â|ê|ô|û|î|ã|ý|þ|ñ|ð|ç|ß ];

# =============================================================
# Numeric expressions.  %0 is escaped because a bare 0 denotes
# epsilon in xfst regular expressions.
# =============================================================
echo >>> numeric expressions
define Digit   [ %0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9];
define NumOp   [ %- | %+ | %* | %/ | %= | %: ];
define NumSep  [ %. | %, | %: ]; # %: added to original, for 1993:18 etc.

# Two digit strings joined by - or *, with optional single spaces.
define NUMSPAN [ Digit+ [(SP) [%-|%*] (SP)] Digit+ ] ;

# Earlier definition, kept for reference:
#define NUM  [ Digit | NumOp | NumSep]+ & $[Digit] ;
# Optional prefix of digits/operators/separators (plus optional space),
# then at least one digit, then an optional separator and an optional
# operator.
define NUM   ( [ Digit | NumOp | NumSep]+ (SP)) Digit+ (NumSep) (NumOp) ;
define NUMOrd NUM %. ;
define ROM [ I|V|X|L|C|M ]+ ; # too permissive...
define NUMROM [ NUM | ROM | NUMSPAN ] ;
define NUMCASED [ NUM+ %: Small+ ] ;

# Does this mean a string of at least one of Digit or NumOp or NumSep,
# followed by at least one Digit?
# It seems that the preproc erroneously leaves NumOp and NumSep on the
# numbers, instead of separating (it splits 987" but not 987: or 987.).
# Thus, it erroneously tells that a number cannot close a sentence.

# =============================================================
define Letter [ Small | Capital ] ;
#define INIT [ Capital %. ]+ ;
define INIT [ Letter %. ]+ ;    # Thus, INIT not only for J. Smith
                                # but also for j. Smith and s. 43.
define CAPNUM [ Capital | NUM ] ;

# =============================================================
echo >>> list of abbreviations
# There are four groups of abbreviations:
#<++> INTRANSABBR    - ends a snt when foll. by capital letter or number.
#<-+> INTRANSNUMABBR - ends a snt when foll. by number.
#<+-> INTRANSCAPABBR - ends a snt when foll. by capital letter.
#<--> TRANSABBR      - does not end a snt.

# Many of the abbreviations are commented out, awaiting
# a more thorough evaluation, and, while waiting, in
# order to shorten compilation time.
# They should be re-introduced on an individual
# basis.
# -------------------------------------------------------------
# TRNUMAB: a separate abbreviation list (written without the
# trailing period).
# -------------------------------------------------------------
define TRNUMAB [
    {Apd} | {Aut} | {bie} | {Bnr} | {Dnr} | {Enr} | {Gnr} | {Pb}
  | {Rg} | {Rt} | {S.nr} | {Sál} | {Kor} | {O.nr} | {alm} | {anr}
  | {bet} | {b.c} | {bc} | {bnr} | {čk} | {djd} | {dmu} | {ds}
  | {enr} | {fnr} | {gnr} | {j} | {jnr} | {kap} | {kr} | {lnr}
  | {mk} | {nr} | {org.nr} | {Pk} | {pgf} | {rieg} | {rievd} | {rskr}
  | {s} | {spr} | {dii} | {tii} | {tlf} | {vnr}
] ;

# <+c,+n> (+c,+n means clb if followed by capital,number)
# Abbreviations that end a sentence if
# followed by letter in upper case or a number.
# These are typically nouns.
#
# Layout note: entries that occurred twice verbatim in the union
# ({mod}, {f}, {ID}) are written once; a union is unaffected by
# repeated members.
define INTRANSABBR [
    {1p} | {2p} | {3p} | {a} | {ref} | {abl} | {adj} | {adv}
  | {advl} | {affekt} | {akk} | {akt} | {all} | {am} | {anat} | {att}
  | {attr} | {bear} | {bfr} | {biehttalanh} | {a.D} | {at} | {can} | {kan}
  | {gl.res} | {gt} | {ie} | {Jak} | {kat} | {kl} | {korr} | {kons}
  | {kto} | {lc} | {lab} | {vt} | {govvalab} | {lat} | {doavttirgrádastip}
  | {liks} | {lix} | {lingv} | {log} | {med} | {miner} | {mod} | {mp}
  | {nd} | {neg} | {nto} | {obsol} | {op} | {ord} | {oss} | {pa}
  | {par} | {part.prf} | {pass} | {pm} | {pos} | {post} | {pp} | {ps}
  | {pt} | {pts} | {pta} | {rbl} | {reg} | {rek} | {sch} | {sfr}
  | {sic} | {st} | {st.dieđ} | {p.b} | {st.prp} | {tlg} | {vol} | {vulg}
  | {biol} | {bot} | {Eftf} | {Bq} | {bto} | {ráŋggáštusč} | {ráŋggáštč}
  | {c} | {ca} | {cal} | {et} | {alii} | {cc} | {cea} | {cg}
  | {cl} | {cm} | {cm3} | {cos} | {cot} | {co2} | {C U L8R} | {d}
  | {daa} | {dB} | {dea} | {dearvvašvuođaosd} | {ded} | {dept} | {dg}
  | {dhj} | {Dj} | {dl} | {dm2} | {dm²} | {dm3} | {dm³} | {do}
  | {dual} | {dub} | {duor} | {parl} | {vuos} | {maŋ} | {mán} | {dis}
  | {duorast} | {e} | {eanadoallolávd} | {eaŋg} | {estteg} | {etc} | {euf}
  | {f} | {fac} | {ff} | {Fim} | {fin} | {fol} | {fr} | {g}
  | {gaskkav} | {gaskav} | {dii.oahp} | {ov.sk.oahp} | {gen} | {geogr}
  | {geom} | {ger} | {ggl} | {ggl.cea} | {gl} | {gl.rs} | {goall}
  | {Guovdag} | {Guovdg} | {GWh} | {guovl} | {h} | {ha} | {Hedm} | {hg}
  | {hl} | {hárv} | {Hz} | {i} | {P.S} | {id} | {ID} | {ill}
  | {imp} | {impers} | {impf} | {imprt} | {ind} | {indekl} | {inf} | {info}
  | {ing} | {instr} | {intr} | {ipmilbálv} | {j.d} | {j.d.s} | {j.e} | {j.s}
  | {j.v} | {jd} | {jdd} | {je} | {jed} | {jk1} | {jk2} | {jna}
  | {jnv} | {journ} | {Kp} | {jr} | {js} | {jur} | {jv} | {kb}
  | {Kbh} | {kg} | {kHz} | {km} | {km2} | {km²} | {km3} | {km³}
  | {wo} | {komp} | {kond} | {konj} | {konkr} | {Kr} | {kV} | {kvm}
  | {kW} | {kWh} | {l} | {lm} | {Lohk} | {lok} | {lp} | {lád}
  | {lávv} | {láv} | {Muf} | {m} | {m.fl} | {matem} | {mearrid} | {mg}
  | {MHz} | {milj} | {miljo} | {mill} | {mKr} | {ml} | {mm} | {mm2}
  | {mm²} | {mm3} | {mm³} | {mrd} | {mV} | {mW} | {m2} | {m²}
  | {m3} | {m³} | {máŋggaidl} | {NewZ} | {njukč} | {o.Kr} | {o.s} | {obj}
  | {od.prp} | {oss.jođ} | {ped.jođ} | {oKr} | {ol} | {op.cit} | {osd}
  | {ovttaidl} | {ovttaj} | {ovttajie} | {ovttajien} | {P.E.N} | {part}
  | {partit} | {pc} | {perf} | {perš} | {pers} | {pf} | {pH} | {phil}
  | {philos} | {pl} | {postp} | {pot} | {pr} | {prep} | {pres} | {pret}
  | {prf} | {prol} | {pron} | {prot} | {prs} | {prt} | {prv} | {pst}
  | {pvc} | {refl} | {rel} | {relat} | {relig} | {res} | {rskr} | {ru}
  | {ruoŧag} | {ry} | {s.v} | {sek} | {sg} | {sign} | {sl} | {sotn}
  | {spd} | {Spd} | {stip} | {subj} | {subst} | {suoi} | {suoidn} | {sup}
  | {sátnečilg} | {tabl} | {tan} | {temp} | {trans} | {transl} | {uŋgárg}
  | {UiB} | {UiO} | {UiT} | {vk} | {vsu} | {váldolávdeg} | {á}
  | {Áššemeannud} | {č} | {č.ortn} | {čilg} | {čs} | {đ} | {k} | {n}
  | {o} | {p} | {q} | {t} | {u} | {v} | {w} | {x}
  | {y} | {z}
] ;

# <-c,+n>
# Abbreviations that are only CLB
# if followed by a number.
# Mainly titles.

# <+c,-n>
# Abbreviations that only end a sentence
# if followed by an upper case letter,
# not when followed by a number.
# -------------------------------------------------------------
# INTRANSCAPABBR <+c,-n>: abbreviations written WITH their final
# period.  They are the input of the TOKINTRANSCAP rule, whose
# right context is white space followed by a Capital.
#
# Entries currently disabled (kept here so that they can be
# re-introduced one by one):
#   {1.Sam.}  {2.Sam.}  {B.innst.S.nr.}  {Besl.L.nr.}  {Besl.O.nr.}
#   {Dronningensgt.}  {Eidsvollsgt.}  {Elvegt.}  {Hausmannsgt.}
#   {Holmegt.}  {Industrigt.}  {Johansgt.}  {Kirkegt.}  {Kongensgt.}
#   {Langgt.}  {Musegt.}  {Olavsgt.}  {Oslov.}  {Prinsensgt.}
#   {Rådhusgt.}  {Slottsgt.}  {Strandgt.}  {Trondheimsv.}  {a.D.}
#   {da.}  {it.}
# -------------------------------------------------------------
define INTRANSCAPABBR [
    {Akersgt.} | {Co.} | {Dan.} | {Dok.nr.}
  | {Innst.O.nr.} | {Innst.S.nr.} | {Josv.} | {Kong.}
  | {Mark.} | {Matt.} | {Mos.} | {Nkr.}
  | {Ob.} | {Ot.prp.nr.} | {Paul.} | {Rom.}
  | {Sam.} | {St.dieđ.nr.} | {St.meld.nr.} | {St.prp.nr.}
  | {Storgt.} | {Tel.} | {Tlf.nr.} | {Tlf.}
  | {bd.} | {borg.} | {čakč.} | {cuoŋ.}
  | {geas.} | {gnr.} | {golg.} | {guov.}
  | {jnr.} | {juov.} | {kap.} | {kr.}
  | {lnr.} | {ltd.} | {ltr.} | {maks.}
  | {mar.} | {mask.} | {mies.} | {mobiltlf.}
  | {njuk.} | {no.} | {nr.} | {ođđj.}
  | {p.b.} | {pb.} | {pgf.} | {ru.}
  | {rv.} | {skáb.} | {suoi.}
] ; # dot% noStb.db

# <-c,-n>
# Abbreviations that never end a sentence, also if followed
# by a capital letter or a number.
# -------------------------------------------------------------
# TRANSABBR <-c,-n>: abbreviations (written without the trailing
# period) that never end a sentence.  No boundary rule below
# uses this list; it only contributes to ABBR and hence to Token.
# -------------------------------------------------------------
define TRANSABBR [
    {1.aman} | {Adr} | {aman} | {anár} | {art} | {adr} | {avd} | {avd.dir}
  | {Avd.dir} | {b} | {bd} | {biht} | {Bj} | {bbl} | {borgem} | {bnr}
  | {buo} | {cand.mag} | {cand.oecon} | {cand.philol} | {cand.real}
  | {cand.scient} | {cand.theol} | {Chr} | {cuoŋom} | {cuoŋ} | {d.l} | {d.g}
  | {darj} | {davvisuopm} | {dbm} | {dearvvašv} | {Dep} | {dhj} | {dieđ}
  | {dieđ.nr} | {dipl.ing} | {dipl.ins} | {dir} | {dr} | {dr.art} | {esc}
  | {dr.med} | {dr.philos} | {dr.poilit} | {dr.theol} | {dáb} | {e.e} | {ea}
  | {ea.ea} | {Edv} | {ee} | {Ee} | {eksp} | {eksp.hoavd} | {eksp.sj}
  | {eren} | {erenoamášped} | {evtt} | {ex} | {ex.fac} | {eŋg} | {fax}
  | {fenr} | {fig} | {fil.tri} | {Gnr} | {geassem} | {geas} | {gen.lt}
  | {genr} | {genr.lt} | {germ} | {gield} | {golggotm} | {guovvam} | {guov}
  | {gárj} | {geahč} | {gč} | {Gč} | {H.K.H} | {H.M} | {h.r.adv} | {hr}
  | {inkl} | {Innst} | {institušuvn} | {isl} | {Johs} | {juovlam} | {juov}
  | {kapt} | {kard} | {koord} | {korp} | {korpr} | {Kr} | {lekt} | {lic}
  | {lnr} | {ltn} | {Luk} | {lul} | {lullisuopm} | {lágid} | {mag}
  | {mag.art} | {merc} | {Mield} | {miessem} | {mies} | {milj} | {mr}
  | {Mr} | {mrs} | {Mrs} | {ms} | {Ms} | {mársm} | {ng} | {njukčam}
  | {njuk} | {nuort} | {nuortasuopm} | {o.m.d} | {oahp} | {oarjesuopm}
  | {obl} | {oblt} | {Od.dieđ.nr} | {Od.prop} | {Od.prp} | {Od.prp.nr}
  | {odont} | {oecon} | {omd} | {ordf} | {ossod.dir} | {Ot} | {Ot.prop.nr}
  | {Ot.prp} | {Ot.prp.nr} | {Oth.prp} | {ovd} | {ovdasu} | {overs}
  | {ođđj} | {ođđaj} | {ođđajagem} | {ped} | {pharm} | {philol} | {pol}
  | {polit} | {priv} | {pro} | {prof} | {Proavássb} | {prop} | {prp}
  | {psychol} | {psyk} | {r} | {red} | {res.kap} | {resipr} | {Sadj.b}
  | {Sadjb} | {Savkadasv} | {sb} | {Sd} | {Sd.dieđ.nr} | {Sd.prp}
  | {Sd.prp.nr} | {Seb} | {skábmam} | {sos} | {spes.ped} | {St} | {St.d.nr}
  | {St.died} | {St.dieđ} | {St.dieđ.nr} | {St.meld} | {St.meld.nr}
  | {St.nr} | {St.prp} | {St.prp.nr} | {stud} | {stud.philol} | {sugr}
  | {suoidnem} | {suopm} | {suorgg} | {sá} | {sásu} | {tel} | {Th}
  | {tlf} | {tri} | {Turnusbálv} | {ubm} | {urál} | {Vik.pr} | {vit.ass}
  | {vrd} | {Vrd} | {vs} | {vsá} | {Vsá} | {čakčam} | {čakč} | {čuo}
  | {A} | {B} | {C} | {D} | {E} | {F} | {G} | {H} | {I} | {J}
  | {K} | {L} | {M} | {N} | {O} | {P} | {Q} | {R} | {S} | {T}
  | {U} | {V} | {W} | {X} | {Y} | {Z} | {Æ} | {Ø} | {Å} | {Ä}
  | {Ö} | {Č} | {Š}
] ;

echo >>> collecting all abbreviations into one set
# Note: TRNUMAB is not part of this union.
define ABBR [ INTRANSABBR | INTRANSCAPABBR | TRANSABBR ] ;

# =============================================================
echo >>> some multi-words
define MWE [ {feara manin} | {C U L8R} ] ;

# Marker for multi-words: TOK1 wraps each MWE in M1 ... M2 so that
# it can be matched as a single Token (MWE1), then deletes the markers.
define M1 "<<" ;
define M2 ">>" ;
define MWE1 [M1 MWE M2] ;

# =============================================================
echo >>> define web addresses
# A "http://www." or "www." prefix followed by one or more
# non-space characters (punctuation included).
define URL [ [{http://www.}|{www.}] [ Char | SINGLE ]+ ];

# =============================================================
echo >>> define tokens
define Token [ WORD | SYMBOL | ABBR | INIT | NUMOrd | NUMCASED | NUM | MWE1 | URL ];

# some non-functioning versions left for evaluation here.
# define Token [ WORD | SYMBOL | TRANSABBR | INIT | NUM ];
# define Token [ WORD | SYMBOL | ABBR | INIT | NUM | NUMCASED ];
# define Token [
# WORD | SYMBOL | TRANSCAPABBR | TRANSNUMABBR | TRANSABBR | INIT | NUM ];

# ==============================================================
echo >>> finding titles etc.
#define TOK3 [ NL -> NL %. || _ NL ] ;  # 2209 out!
# I take two newlines, and insert a period in between them.
# The result I call TOK3, and I insert it in the regex line at the
# bottom of this file.

# =============================================================
# handling the abbreviations
#
# Each rule below copies its match ("...") and appends
# newline + period + newline when the match is followed by white
# space and the stated right context.

echo >>> newline and period copying after numeral
define TOKNUMOrd [ NUMOrd @-> ... NL %. NL || _ WS+ Capital ] ;

echo >>> newline and period copying after abbr. that are intr. wrt. capitals
define TOKINTRANSCAP [ INTRANSCAPABBR @-> ... NL %. NL || _ WS+ Capital ] ;

# echo >>> newline and period copying after abbr. that are intr. wrt. numbers
# define TOKINTRANSNUM [ INTRANSNUMABBR @-> ... NL %. NL || _ WS+ NUMROM ] ;

# TOKTRNUMAB is one of the factors of the final composition below, so
# it has to be defined.  While this definition was commented out, xfst
# read the bare name as a literal multicharacter symbol and the
# composed network could only pass that symbol.
# NOTE(review): if this rule was disabled on purpose, remove
# TOKTRNUMAB from the "read regex" line instead of keeping it here.
echo >>> newline and period copying after abbr. when followed by capital OR Num
define TOKTRNUMAB [ TRNUMAB @-> ... NL %. NL || _ WS+ CAPNUM ] ;

echo >>> newline and period copying after abbr. that are always intr.
define TOKINTRANS [ INTRANSABBR @-> ... NL %. NL || _ WS+ CAPNUM ] ;

#echo >>> fix the 34.
#define TOK01 [ NUMOrd @-> ... NL ] ;

echo >>> longest match--insert a newline after each token
# multi start
# Bound: what may surround a multi-word expression -- punctuation,
# white space, or the edge of the string.
define Bound [ SINGLE | WS | .#. ];
# Three composed steps: bracket multi-words with M1/M2, append NL to
# every longest-match Token, then delete the M1/M2 markers again.
define TOK1 [ MWE @-> M1 ... M2 || Bound _ Bound
         .o.  Token @-> ... NL
         .o.  [M1|M2] -> 0 ] ;

echo >>> normalize space
# Collapse every run of white space into a single space.
define TOK0 [ WS+ @-> SP ];

echo >>> remove spaces
# A run of white space that contains at least one newline.
define WŠ [WS]+ & $[NL] ;
#define TOK1 [ Token @-> ... NL ] ;

echo >>> map spaces to a newline
define TOK2 [ [WŠ]+ @-> NL ];
#define TOK2 [ [WS]+ @-> NL ];
# multi end, remove this note when working

# =============================================================
echo >>> compose the above
# Earlier variants of the tail of this composition, kept for reference
# (moved out of the regex so that no comment line sits inside it):
#   .o. TOK1 .o. TOK2 .o. TOK3 ];   # 2209 out!
#   .o. TOK1 .o. TOK2 ];            # this one works!
read regex [ TOKNUMOrd .o. TOKINTRANSCAP .o. TOKTRNUMAB .o. TOKINTRANS
             .o. TOK0 .o. TOK1 .o. TOK2 ];
invert net