# -*- cg-pre-pipe: "hfst-tokenise -g ../../tools/preprocess/tokeniser-disamb-gt-desc.pmhfst" -*-

# ===================== #
# Faroese disambiguator #
# ===================== #

# ========== #
# Delimiters #
# ========== #

# Cohorts that end a disambiguation window.
# NOTE(review): the second and third entries arrived as empty strings ("" ""),
# which look like angle-bracket word forms that were stripped in transit.
# "<!>" and "<?>" are assumed here -- confirm against the tokeniser output.
DELIMITERS = "<.>" "<!>" "<?>" "<¶>" sent ;

# ============= #
# Tags and sets #
# ============= #

# ========
SETS
# ========

# Window-boundary markers.
# NOTE(review): the second composite tag of each list arrived empty, "()".
# (<s>) and (</s>) are assumed to be the lost markup-style tags -- confirm.
LIST BOS = (>>>) (<s>) ;
LIST EOS = (<<<) (</s>) ; # vislcg and CG-2 together.

# One-to-one lists: every name below matches the tag spelled the same way,
# so rules can refer to a tag by its bare name.

# Word classes
LIST N = N ;
LIST V = V ;
LIST A = A ;
LIST A* = A* ;
LIST Prop = Prop ;
LIST Adv = Adv ;
LIST Num = Num ;
LIST CC = CC ;
LIST CS = CS ;
LIST Interj = Interj ;
LIST Abbr = Abbr ;
LIST ACR = ACR ;
LIST Pr = Pr ;
LIST Pron = Pron ;
LIST Pers = Pers ;
LIST Det = Det ;
LIST Dem = Dem ;
LIST Refl = Refl ;
LIST Recipr = Recipr ;
LIST Poss = Poss ;
LIST Interr = Interr ;
LIST IM = IM ;

# Case (see NAGD below)
LIST Nom = Nom ;
LIST Acc = Acc ;
LIST Gen = Gen ;
LIST Dat = Dat ;

# Gender (see GENDER below)
LIST Msc = Msc ;
LIST Fem = Fem ;
LIST Neu = Neu ;

# Number (see NUMBER below)
LIST Sg = Sg ;
LIST Pl = Pl ;

LIST Def = Def ;
LIST Indef = Indef ;
LIST Cmp = Cmp ;
LIST Superl = Superl ;

# Verbal tags
LIST Prs = Prs ;
LIST Prt = Prt ;
LIST 1Sg = 1Sg ;
LIST 2Sg = 2Sg ;
LIST 3Sg = 3Sg ;
LIST Inf = Inf ;
LIST PrfPtc = PrfPtc ;
LIST PrsPrc = PrsPrc ;
LIST Ind = Ind ;
LIST Imp = Imp ;
LIST Sbj = Sbj ;
LIST Pass = Pass ;
LIST Sup = Sup ;

LIST Cmpnd = Cmpnd ;

# Clause boundaries and punctuation
LIST CLB = CLB ;
LIST CLBfinal = CLBfinal ; # because common num
LIST PUNCT = PUNCT ;
LIST LEFT = LEFT ;
LIST RIGHT = RIGHT ;

# Remaining analyser tags
LIST Guess = Guess ;
LIST Err/Guess = Err/Guess ;
LIST Der/A = Der/A ;
LIST Sem/ID = Sem/ID ;
LIST Arab = Arab ;
LIST Ord = Ord ;
LIST Err/Orth = Err/Orth ;

# Lemmas and symbols used as currency units.
LIST CURRENCY = "dinar" "dollar" "euro" "krona" "kr" "rubel" "¢" "€" "$";

# Sem/* tags
LIST Sem/Fem = Sem/Fem ;
LIST Sem/Mal = Sem/Mal ;
LIST Sem/Plc = Sem/Plc ;
LIST Sem/Sur = Sem/Sur ;
LIST Sem/Org = Sem/Org ;
LIST Sem/Veh = Sem/Veh ;
LIST Sem/Year = Sem/Year ;

# TAG collects many (not all) of the tags declared above; it is the base of
# "TAG - X" style sets.
# NOTE(review): the list used to name Sup twice. The first occurrence sat
# directly after Cmp, i.e. in the slot Superl occupies in the declarations
# above, so it is written as Superl here -- confirm.
LIST TAG = N V A Adv CC CS Interj Pr Pron Pers Det Refl Recipr Poss
           Nom Acc Gen Dat
           Msc Fem Neu
           Sg Pl
           Def Indef Cmp Superl
           Prs Prt 1Sg 2Sg 3Sg Inf PrfPtc PrsPrc Sup Imp
           CLB PUNCT LEFT RIGHT
           Sem/Fem Sem/Mal Sem/Plc Sem/Sur Sem/Org Sem/Veh
           Guess ;

LIST NAGD = Nom Acc Gen Dat ;
LIST AGD = Acc Gen Dat ;
LIST GENDER = Msc Fem Neu ;
LIST NUMBER = Sg Pl ;

# Every gender x number x case combination, spelled out as composite tags.
SET NAGDNUMBERGENDER =
    (Msc Sg Nom) OR (Msc Sg Acc) OR (Msc Sg Gen) OR (Msc Sg Dat) OR
    (Msc Pl Nom) OR (Msc Pl Acc) OR (Msc Pl Gen) OR (Msc Pl Dat) OR
    (Fem Sg Nom) OR (Fem Sg Acc) OR (Fem Sg Gen) OR (Fem Sg Dat) OR
    (Fem Pl Nom) OR (Fem Pl Acc) OR (Fem Pl Gen) OR (Fem Pl Dat) OR
    (Neu Sg Nom) OR (Neu Sg Acc) OR (Neu Sg Gen) OR (Neu Sg Dat) OR
    (Neu Pl Nom) OR (Neu Pl Acc) OR (Neu Pl Gen) OR (Neu Pl Dat) ;

# Sets
# ====

# Any reading with one of the listed word-class tags, or the literal "\?" tag.
SET WORD = N | V | A | Pr | Pron | Det | Adv | CC | CS | Interj | Num | ("\?") ;

# Noun sets
SET NounMscFem = (N Msc) OR (N Fem) ;
SET NounMscNeu = (N Msc) OR (N Neu) ;
SET NounFemNeu = (N Fem) OR (N Neu) ;

SET MscFem = Msc OR Fem ;
SET MscNeu = Msc OR Neu ;
SET FemNeu = Fem OR Neu ;

LIST 3PERS = "hann" "hon" "tað" ;

# Month lemmas; several months are listed in two spellings (-er / -ur).
LIST MONTH = "januar" "februar"
             "desember" "september" "november" "oktober"
             "desembur" "septembur" "novembur" "oktobur"
             "mars" "august" "april" "mai" "juni" "juli" ;

# Adjective sets
SET REALADJ = A OR A* ;

# Nominal sets
LIST NOMINALHEAD = N Prop Num Pers Refl Recipr (Pron Interr) (Pron Indef) ;
LIST NorA = N A ;
LIST PRENAGR = Det A ;

LIST MIDJA = "miðja" ; # in the middle of - construction. More N in here, also bottle constr.
# Readings that match TAG but do not match Acc.
SET NOT-ACC = TAG - Acc ;

# Verb sets
LIST COPULA = "vera" "verða" ;
LIST HAVA = "hava" ;
LIST MODV = "kunna" "láta" "skula" "vilja" "munna" "mega" ;
SET AUX = COPULA OR HAVA OR MODV ;

LIST VFIN = Prs Prt Imp Sbj ;
LIST INDSBJ = Ind Sbj ;
#LIST Ind = Prs Prt ;
LIST NONTHIRDV = 1Sg 2Sg 1Pl 2Pl ;

# The third member used to read PrfPrs, a tag that no other visible line
# declares or uses; PrfPtc is what the one-to-one tag lists and TAG use.
LIST VINFIN = Inf Sup PrfPtc PrsPrc ;

LIST DATV = "bjóða" "hýsa" "sleppa" "smakka" "takka" ; # "undirvísa" # sleppa3 = sleppa manninum leysum

# "vera" used to be listed twice; a LIST is a set, so one mention suffices.
LIST DATPREPV = "liggja" "vera" "standa" "fiska" ; # use only ACCPREPV
LIST ACCPREPV = "leggja" "fara" "renda" "koma" "liða" "seta" ;

LIST OBJPREDVERB = "kalla" "doypa" "nevna" "taka" ;
LIST MOVEMENTVERB = "koma" "koyra" ;

# Verbs other than the copulas, the modals and the DATV verbs.
SET TV = V - COPULA - MODV - DATV ;
LIST SAFETV = "eiga" ;

# Noun-Verb sets
LIST THIRDSG = Sg 3Sg ;
# Was "Sg 3Pl", which repeats the Sg of THIRDSG; THIRD below pairs Pl with 3Pl.
# NOTE(review): confirm that no rule relied on the old Sg member.
LIST THIRDPL = Pl 3Pl ;
LIST THIRD = Sg 3Sg Pl 3Pl ;

# Number sets
# Numerals except the lemmas "eitt" and "1" and anything tagged Sem/Year.
SET NUMBERS = Num - ("eitt") - ("1") - Sem/Year ;

# Preposition sets
# The list names spell out which cases the members are grouped under.
LIST ACCPREP = "aftan" "aftanvert" "ábeint" "áraka" "foruttan" "gjøgnum"
               "hóast" "inntil" "ígjøgnum" "íkring" "kring" "niðan" "oman"
               "síðan" "umframt" "umhvørvis" "umkring"
               "báðumegin" "hasumegin" "hvørgumegin" "høgrumegin"
               "sínumegin" "skeivumegin" "somumegin" "vinstrumegin" "øðrumegin"
               "eystan" "norðan" "sunnan" "sum" "vestan" ;

LIST DATPREP = "hjá" "með" "aftrat" "afturat" "afturímóti" "andstøðis"
               "andsýnis" "frá" "mótvegis" "nær" "sambært" "viðvíkjandi"
               "úr" ; # úr?

LIST ACCDATPREP = "í" "á" "undir" "yvir" "eftir" "fyri" "við" ;
LIST ACCGENPREP = "innan" "uttan" "millum" "til" "vegna" ;
LIST ACCDATGENPREP = "at" "av" "hjá" "móti" "um" "undan" "úr" ;

# Unions: every preposition list whose name includes the given case.
SET SOMEACCPREP = ACCPREP | ACCDATPREP | ACCGENPREP | ACCDATGENPREP ;
SET SOMEDATPREP = DATPREP | ACCDATPREP | ACCDATGENPREP ;
SET SOMEGENPREP = ACCGENPREP | ACCDATGENPREP ;
SET SOMEACCDATPREP = ACCDATPREP | ACCDATGENPREP ;

# Boundary sets
# NOTE(review): S-BOUNDARY2 is currently an exact copy of S-BOUNDARY.
SET S-BOUNDARY = CS | Interr | (";") | (":") | BOS | EOS ;
SET S-BOUNDARY2 = CS | Interr | (";") | (":") | BOS | EOS ;

# Complementary set

# Case sets
# Some case, but not...
# Case unions that leave out the case(s) named after NOT.
SET NOTNOM = Dat OR Gen OR Acc ;
SET NOTDAT = Nom OR Gen OR Acc ;
SET NOTACC = Nom OR Gen OR Dat ;
SET NOTACCDAT = Nom OR Gen ;

SET OBL = Acc OR Dat OR Gen ;

# Anything but the following case...
SET NOACC = WORD - Acc ;
SET NODAT = WORD - Dat ;
SET NOACCDAT = WORD - Acc - Dat ;

#SET NO

SET PRE-N = A OR Det OR (N Gen) OR Num OR (Pron Gen) OR CC ; # Det???

# Punctuation
LIST COMMA = (",") ;
SET MARK = COMMA OR ("\\") OR ("\;") ; #"

LIST PUNCT-LEFT = (PUNCT LEFT) ;
LIST PUNCT-RIGHT = (PUNCT RIGHT) ;

SET PRE-APP = COMMA OR PUNCT-LEFT ;

# WORD plus MARK, and the subsets obtained by removing classes from it.
SET WORDMARK = WORD OR MARK ;

SET NPNH = WORDMARK - PRE-N ;
SET NPNHA = WORDMARK - PRE-N - Adv ;
SET NOT-ADV = WORDMARK - Adv - CC ; # adding CC
SET NOT-PROP = WORDMARK - Prop - CC ;
SET LEX-ADV = Adv - (A*) ;
SET NOT-A = WORDMARK - A ;
SET NOT-CC = WORDMARK - CC ;

SET NOUNADJ = N OR A ;
SET NP-MEMBER = PRE-N OR N OR Pron ;

LIST TIME = "sunnudagur" ;

# Semantic sets
LIST ABSTRACT = "ráð" "byráð" ;

# These words are usually plurals when used alone. All countable neuters?
## Tú sært barn standa.  (roughly: "You see child(ren) standing.")
LIST BAREPLURALS = "barn" "fólk" "bygdafólk" "konufólk" ;

LIST PLACEADVERB = "ímillum" ;

# Word sets
LIST TAD = "tað" ;

#################################################
#################################################
#                disambiguation                 #
#################################################
#################################################

BEFORE-SECTIONS

SECTION