# -*- cg-pre-pipe: "$GTHOME/giella-core/scripts/preprocess --abbr=$GTHOME/langs/fao/tools/preprocess/abbr.txt | hfst-optimised-lookup $GTHOME/langs/fao/src/analyser-disamb-gt-desc.hfstol | $GTHOME/giella-core/scripts/lookup2cg" -*-

# ===================== #
# Faroese disambiguator #
# ===================== #

# ========== #
# Delimiters #
# ========== #

# Tokens that end a disambiguation window.
# NOTE(review): "<!>" and "<?>" replace two empty "" strings found here;
# they look like angle-bracket tokens lost in transit. Confirm against the
# tokeniser output.
DELIMITERS = "<.>" "<!>" "<?>" "<¶>" sent ;

# ============= #
# Tags and sets #
# ============= #

# ========
# SETS
# ========

# Beginning / end of sentence. vislcg and CG-2 together.
# NOTE(review): (<s>) and (</s>) replace empty () composites found here;
# confirm they match the boundary tags used by the pipeline.
LIST BOS = (>>>) (<s>) ;
LIST EOS = (<<<) (</s>) ;

# --- One LIST per analyser tag, so the tag can be used as a set name ---

# Parts of speech
LIST N = N ;
LIST V = V ;
LIST A = A ;
LIST A* = A* ;
LIST Prop = Prop ;
LIST Adv = Adv ;
LIST Num = Num ;
LIST CC = CC ;
LIST CS = CS ;
LIST Interj = Interj ;
LIST Abbr = Abbr ;
LIST ACR = ACR ;
LIST Pr = Pr ;
LIST Pron = Pron ;

# Pronoun / determiner subtypes
LIST Pers = Pers ;
LIST Det = Det ;
LIST Dem = Dem ;
LIST Refl = Refl ;
LIST Recipr = Recipr ;
LIST Poss = Poss ;
LIST Interr = Interr ;
LIST IM = IM ;

# Case
LIST Nom = Nom ;
LIST Acc = Acc ;
LIST Gen = Gen ;
LIST Dat = Dat ;

# Gender
LIST Msc = Msc ;
LIST Fem = Fem ;
LIST Neu = Neu ;

# Number
LIST Sg = Sg ;
LIST Pl = Pl ;

# Definiteness and comparison
LIST Def = Def ;
LIST Indef = Indef ;
LIST Cmp = Cmp ;
LIST Superl = Superl ;

# Verbal categories
LIST Prs = Prs ;
LIST Prt = Prt ;
LIST 1Sg = 1Sg ;
LIST 2Sg = 2Sg ;
LIST 3Sg = 3Sg ;
LIST Inf = Inf ;
LIST PrfPrc = PrfPrc ;
LIST PrsPrc = PrsPrc ;
LIST Ind = Ind ;
LIST Imp = Imp ;
LIST Sbj = Sbj ;
LIST Pass = Pass ;
LIST Sup = Sup ;

# Miscellaneous
LIST Cmpnd = Cmpnd ;
LIST CLB = CLB ;
LIST PUNCT = PUNCT ;
LIST LEFT = LEFT ;
LIST RIGHT = RIGHT ;
LIST Guess = Guess ;

# Semantic tags
LIST Sem/Fem = Sem/Fem ;
LIST Sem/Mal = Sem/Mal ;
LIST Sem/Plc = Sem/Plc ;
LIST Sem/Sur = Sem/Sur ;
LIST Sem/Org = Sem/Org ;
LIST Sem/Veh = Sem/Veh ;

# Collective tag list; NOT-ACC is derived from it by subtracting Acc.
LIST TAG = N V A Adv CC CS Interj Pr Pron Pers Det Refl Recipr Poss Nom Acc Gen Dat Msc Fem Neu Sg Pl Def Indef Cmp Sup Prs Prt 1Sg 2Sg 3Sg Inf PrfPrc PrsPrc Sup Imp CLB PUNCT LEFT RIGHT Sem/Fem Sem/Mal Sem/Plc Sem/Sur Sem/Org Sem/Veh Guess ;

# Category lists; the rules reference them as $$NAGD, $$GENDER and $$NUMBER.
LIST NAGD = Nom Acc Gen Dat ;
LIST AGD = Acc Gen Dat ;
LIST GENDER = Msc Fem Neu ;
LIST NUMBER = Sg Pl ;

# Every gender x number x case combination, spelled out.
SET NAGDNUMBERGENDER =
    (Msc Sg Nom) OR (Msc Sg Acc) OR (Msc Sg Gen) OR (Msc Sg Dat) OR
    (Msc Pl Nom) OR (Msc Pl Acc) OR (Msc Pl Gen) OR (Msc Pl Dat) OR
    (Fem Sg Nom) OR (Fem Sg Acc) OR (Fem Sg Gen) OR (Fem Sg Dat) OR
    (Fem Pl Nom) OR (Fem Pl Acc) OR (Fem Pl Gen) OR (Fem Pl Dat) OR
    (Neu Sg Nom) OR (Neu Sg Acc) OR (Neu Sg Gen) OR (Neu Sg Dat) OR
    (Neu Pl Nom) OR (Neu Pl Acc) OR (Neu Pl Gen) OR (Neu Pl Dat) ;

# Sets
# ====

SET WORD = N | V | A | Pr | Pron | Det | Adv | CC | CS | Interj | Num | ("\?") ;

# Noun sets
SET NounMscFem = (N Msc) OR (N Fem) ;
SET NounMscNeu = (N Msc) OR (N Neu) ;
SET NounFemNeu = (N Fem) OR (N Neu) ;
SET MscFem = Msc OR Fem ;
SET MscNeu = Msc OR Neu ;
SET FemNeu = Fem OR Neu ;
LIST 3PERS = "hann" "hon" "tað" ;

# Adjective sets
SET REALADJ = A OR A* ;

# Nominal sets
LIST NOMINALHEAD = N Prop Num Pers Refl Recipr (Pron Interr) (Pron Indef) ;
LIST NorA = N A ;
LIST PRENAGR = Det A ;
LIST MIDJA = "miðja" ; # in the middle of - construction. More N in here, also bottle constr.
# Every tag in TAG except Acc.
SET NOT-ACC = TAG - Acc ;

# Verb sets
LIST COPULA = "vera" "verða" ;
LIST HAVA = "hava" ;
LIST MODV = "kunna" "láta" "skula" "vilja" "munna" "mega" ;
SET AUX = COPULA OR HAVA OR MODV ;
LIST VFIN = Prs Prt Imp Sbj ;
LIST INDSBJ = Ind Sbj ;
#LIST Ind = Prs Prt ;
LIST NONTHIRDV = 1Sg 2Sg 1Pl 2Pl ;
# Non-finite forms. The perfect participle tag is PrfPrc, as declared in
# the tag lists ("PrfPrs" matched no reading).
LIST VINFIN = Inf Sup PrfPrc PrsPrc ;
LIST DATV = "bjóða" "hýsa" "sleppa" "smakka" "takka" ; # "undirvísa"
# sleppa3 = sleppa manninum leysum
LIST DATPREPV = "liggja" "vera" "standa" "fiska" ; # use only ACCPREPV
LIST ACCPREPV = "leggja" "fara" "renda" "koma" "liða" "seta" ;
LIST OBJPREDVERB = "kalla" "doypa" "nevna" "taka" ;
LIST MOVEMENTVERB = "koma" "koyra" ;
SET TV = V - COPULA - MODV ;

# Noun-Verb sets
LIST THIRDSG = Sg 3Sg ;
# Plural counterpart of THIRDSG: Pl with 3Pl, matching the pairing in THIRD.
LIST THIRDPL = Pl 3Pl ;
LIST THIRD = Sg 3Sg Pl 3Pl ;

# Number sets
SET NUMBERS = Num - ("eitt") - ("1") ;

# Preposition sets
# The list names state which cases are involved (ACC, DAT, GEN and their
# combinations).
LIST ACCPREP = "aftan" "aftanvert" "ábeint" "áraka" "foruttan" "gjøgnum" "hóast" "inntil" "ígjøgnum" "íkring" "kring" "niðan" "oman" "síðan" "umframt" "umhvørvis" "umkring" "báðumegin" "hasumegin" "hvørgumegin" "høgrumegin" "sínumegin" "skeivumegin" "somumegin" "vinstrumegin" "øðrumegin" "eystan" "norðan" "sunnan" "sum" "vestan" ;
LIST DATPREP = "hjá" "með" "aftrat" "afturat" "afturímóti" "andstøðis" "andsýnis" "frá" "mótvegis" "nær" "sambært" "viðvíkjandi" "úr" ; # úr?
LIST ACCDATPREP = "í" "á" "undir" "yvir" "eftir" "fyri" "við" ;
LIST ACCGENPREP = "innan" "uttan" "millum" "til" "vegna" ;
LIST ACCDATGENPREP = "at" "av" "hjá" "móti" "um" "undan" "úr" ;

SET SOMEACCPREP = ACCPREP | ACCDATPREP | ACCGENPREP | ACCDATGENPREP ;
SET SOMEDATPREP = DATPREP | ACCDATPREP | ACCDATGENPREP ;
SET SOMEGENPREP = ACCGENPREP | ACCDATGENPREP ;
SET SOMEACCDATPREP = ACCDATPREP | ACCDATGENPREP ;

# Boundary sets
SET S-BOUNDARY = CS | Interr | (";") | (":") | BOS | EOS ;
SET S-BOUNDARY2 = CS | Interr | (";") | (":") | BOS | EOS ;

# Complementary set
# Case sets
# Some case, but not...
# Each NOT<case> set is the union of the other three cases.
SET NOTNOM = Dat OR Gen OR Acc ;
SET NOTDAT = Nom OR Gen OR Acc ;
SET NOTACC = Nom OR Gen OR Dat ;
SET NOTACCDAT = Nom OR Gen ;
SET OBL = Acc OR Dat OR Gen ;

# Anything but the following case...
SET NOACC = WORD - Acc ;
SET NODAT = WORD - Dat ;
SET NOACCDAT = WORD - Acc - Dat ;
#SET NO

# Material that may stand before the noun.
SET PRE-N = A OR Det OR (N Gen) OR Num OR (Pron Gen) OR CC ; # Det???

LIST COMMA = (",") ;
SET MARK = COMMA OR ("\\") OR ("\;") ; #"
LIST PUNCT-LEFT = (PUNCT LEFT) ;
LIST PUNCT-RIGHT = (PUNCT RIGHT) ;
SET PRE-APP = COMMA OR PUNCT-LEFT ;

# WORDMARK minus something: used as BARRIER sets in the rules below.
SET WORDMARK = WORD OR MARK ;
SET NPNH = WORDMARK - PRE-N ;
SET NPNHA = WORDMARK - PRE-N - Adv ;
SET NOT-ADV = WORDMARK - Adv - CC ; # adding CC
SET NOT-PROP = WORDMARK - Prop - CC ;
SET LEX-ADV = Adv - (A*) ;
SET NOT-A = WORDMARK - A ;
SET NOT-CC = WORDMARK - CC ;

SET NOUNADJ = N OR A ;
SET NP-MEMBER = PRE-N OR N OR Pron ;

LIST TIME = "sunnudagur" ;

# Semantic sets
LIST ABSTRACT = "ráð" "byráð" ;
LIST BAREPLURALS = "barn" "fólk" "bygdafólk" "konufólk" ;
# these words are usually plurals when used alone. All countable neuters?
## Tú sært barn standa.

# Word sets
LIST TAD = "tað" ;


##################################################
##################################################
#                disambiguation                  #
##################################################
##################################################

SECTION

REMOVE Guess ; # If any other reading is possible.

# We need some erur rules to really select the adjective erur. TODO
REMOVE:NotErur ("erur") ;

# Early and popular rules
SELECT:r50 Inf IF (-1 ("at")) ;
SELECT:r4 IM IF (0 ("at")) (1C Inf) ; # TODO: was SELECT CS... should it be? -KBU
SELECT:r2 Pr IF (0 ("á")) (1 Dat OR Acc) ;

# at
REMOVE:r36 Pr IF (0 ("at")) (NOT *1 Acc OR Gen OR Dat BARRIER V) ;
SELECT:r37 CS IF (0 ("at")) (*1 Nom BARRIER NPNHA LINK *1 V BARRIER OBL) ;
REMOVE:ImForV IM IF (0 ("at")) (NOT 1 Inf) ; # too strict?
REMOVE:rAtN N IF (0 ("at")) (NOT -1 PRE-N) ;
## soleiðis at kvøða einsamallur.
# A or N
# ======
REMOVE:r35 A IF (*-1 Pr OR CLB BARRIER NPNH)(0 N)(1 S-BOUNDARY OR V);
REMOVE (A Fem) IF (0 (N Neu))(-1 COPULA)(-2 ("tað" Nom));
REMOVE (A Msc) IF (0 (N Neu))(-1 COPULA)(-2 ("tað" Nom));
REMOVE N IF (*-1 SOMEDATPREP CBARRIER NPNH)(0 A + Dat)(1 N + Dat) ;
REMOVE N IF (*-1 SOMEACCPREP CBARRIER NPNH)(0 A + Acc)(1 N + Acc) ;
REMOVE N IF (*-1 SOMEACCDATPREP CBARRIER NPNH)(0 A + Dat OR A + Acc)(1 N + Dat OR N + Acc) ;

# TAD
REMOVE Det IF (NOT -2 Nom)(-1 V)(0 Pron + Nom)(NOT *1 N + Def BARRIER NPNHA);
REMOVE Det IF (0 TAD + $$NAGD)(1C $$NAGD);

# Adjective disambiguation
# ------------------------
SELECT:copular A + Neu + Sg + Nom + Indef IF (*-1 COPULA BARRIER NPNHA LINK -1 TAD);
## Tað er ógvuliga heitt

# Remove an adjective gender when the following noun, sharing the case,
# has no reading of that gender.
REMOVE:r18 A + Fem IF (0 $$NAGD) (*1 NounMscNeu + $$NAGD BARRIER NPNHA LINK NOT 0 Fem);
REMOVE:r19 A + Msc IF (0 $$NAGD) (*1 NounFemNeu + $$NAGD BARRIER NPNHA LINK NOT 0 Msc);
REMOVE:r20 A + Neu IF (0 $$NAGD) (*1 NounMscFem + $$NAGD BARRIER NPNHA LINK NOT 0 Neu);
#REMOVE:r18 A + Fem IF (0 $$NAGD) (*1 NounMscNeu + $$NAGD BARRIER NPNHA LINK NOT 0 Fem);
#REMOVE:r19 A + Msc IF (0 $$NAGD) (*1 NounFemNeu + $$NAGD BARRIER NPNHA LINK NOT 0 Msc);
#REMOVE:r20 A + Neu IF (0 $$NAGD) (*1 NounMscFem + $$NAGD BARRIER NPNHA LINK NOT 0 Neu);

# Coordinated adjectives after the copula: COPULA A CC A CLB.
# First rule targets the second adjective, second rule targets the first.
SELECT A IF (-3 COPULA)(-2C A)(-1 CC)(1 CLB);
# The clause boundary sits at position 3, after the second adjective;
# position 1 is already required to be CC.
SELECT A IF (-1 COPULA)(1 CC)(2C A)(3 CLB);

REMOVE (V PrfPrc) IF (NOT *-1 AUX BARRIER S-BOUNDARY)(0 PrfPrc + A);
REMOVE:NotPrtWhenA (V Ind Prt Pl) IF (-1 Det OR A)(0 A + $$NAGD)(*1 N + $$NAGD BARRIER NPNH);

# Case disambiguation
SELECT:TadNom Nom IF (0 TAD)(1 COPULA + 3Sg);
SELECT:r21 $$NAGD IF (0C A)(*1C $$NAGD BARRIER NPNHA)(NOT 0 V); #XXX C?
# Leftward counterpart of r21, with the position written as *-1C.
SELECT:r21b $$NAGD IF (0C A)(*-1C $$NAGD BARRIER NPNHA)(NOT 0 V); #XXX C?
REMOVE:r22 $$NAGD IF (0 A)(*1C N BARRIER NPNHA LINK NOT 0 $$NAGD);
REMOVE:AVS OBL IF (*-1 VFIN BARRIER NPNH LINK *-1 BOS OR CLB BARRIER NOT-ADV)(0 (N Nom));
REMOVE:AVS2 OBL IF (-2 CLB)(-1 Adv OR CC OR CS)(0 Nom)(1 V + THIRDSG)(NOT *2 Nom BARRIER CLB);
REMOVE Nom IF (-2 V)(-1C Pr)(0 N)(1 CLB); # Too specific, this.
REMOVE Dat IF (*-1 V BARRIER NPNH LINK NOT 0 DATV)(0 Acc)(1 EOS OR CS);
## at fáa høvi at...
REMOVE:NotAccPredic Acc IF (*-1 COPULA BARRIER NPNH)(0 Nom);
## Í fyrstuni var orðið.
REMOVE:SgSgNP Pl IF (-1C Det + Sg)(0 A)(1 N + Sg);
REMOVE:PlPlNP Sg IF (-1C Det + Pl)(0 A)(1 N + Pl);

# Noun disambiguation
SELECT ("hús") IF (0 ("húsi")); # Awaiting real examples with 'húsi'

# Conjunctions

# Subjunctions
SELECT CS IF (-1 BOS)(0 ("tí"));
# tí is also a dative pronoun. No instances have been observed
# so far. Needed is thus real examples, in order to select them
# before going for CS.

# PP disambiguation
# =================

# Preposition or not?
#REMOVE Pr IF (NOT *-1 S-BOUNDARY LINK *1 OBL BARRIER NPNH)(0 Adv)(NOT *1 OBL BARRIER NPNH);
REMOVE Pr IF (0 Adv)(NOT *1 OBL BARRIER NPNH);
# This rule does not handle stranded prepositions, it must be relaxed.
REMOVE:PrAdv Adv IF (*-1 N BARRIER V)(0 Pr + SOMEACCPREP)(*1 Acc BARRIER NPNH);
## Altjóða samstarv innan gransking og útbúgving.
## Hon las síðan bókina.
REMOVE:PrCS Pr IF (0 CS)(1C Nom OR VFIN);

# á
# (This rule appeared twice verbatim; one copy is enough.)
REMOVE:AArule N IF (0 ("á" Indef))(-1C NOMINALHEAD OR V);

# millum
REMOVE:r38 Adv IF (0 Dat OR Acc)(*-1 ("millum") BARRIER NPNH);

# móti
SELECT:moti Pr IF (0 ("móti"))(1 Num)(2 Dat);
SELECT:moti Pr IF (0 ("móti"))(1 Dat);

# til
SELECT:r39 Pr IF (0 ("til"))(*1 N OR NPNH OR ("at") BARRIER NPNHA);
SELECT:r3 Pr IF (0 ("til"))(*1 (N Gen) BARRIER NPNHA);

# tíður
SELECT ("tíður" A* Adv) IF (-1 VFIN); #(NOT 0 (Det $$NAGD) LINK 1 N + $$NAGD OR A + $$NAGD);
## Tú hugsar títt um bygdafólk.
# um
# Two rules select CS for "um"; um2 then removes CS wherever it is still
# only one reading among several.
SELECT:um1 CS IF (-1 ("sjálvur")) (0 ("um")) ;
SELECT:umCS CS IF (*-1 BOS OR COMMA BARRIER NOT-CC) (0 ("um")) (*1 Nom BARRIER NPNH LINK 1 VFIN) ;
## Vita, um tú fært talt stjørnumar har!
REMOVE:um2 CS IF (0 ("um")) ;

# Case within PP phrases
SELECT:r40 Acc IF (*-1 ACCPREP BARRIER NOTACC OR V OR Pr OR Adv) ;
SELECT:r41 Dat IF (*-1 DATPREP BARRIER NOTDAT OR V OR Pr OR Adv) ;
REMOVE:r42 NOTDAT IF (*-1 DATPREP BARRIER NOTDAT OR S-BOUNDARY OR V OR Pr OR Adv OR CLB) ;
#REMOVE Acc IF (*-1 DATPREP BARRIER NOTDAT);
#REMOVE Gen IF (*-1 DATPREP BARRIER NOTDAT);
REMOVE:r43 Gen IF (*-1 ACCDATPREP BARRIER NOTACCDAT OR S-BOUNDARY OR V OR Pr OR Adv OR CLB) ;
REMOVE:r44 Nom IF (*-1 ACCDATPREP BARRIER NOTACCDAT OR S-BOUNDARY OR V OR Pr OR Adv OR CLB) ;
# CLB makes it weaker, be prepared to compensate.
REMOVE:AccPronNotIndef (N Indef) IF (-1 ACCPREP OR ACCDATPREP) (0 (Pron Acc)) ;
REMOVE:DatPronNotIndef (N Indef) IF (-1 DATPREP OR ACCDATPREP) (0 (Pron Dat)) ;
SELECT:AccNP (N Acc) IF (-1C (Det Acc)) ;
SELECT (A Dat) IF (*-1 ACCDATPREP BARRIER NPNH) (1 (N Dat)) ;
# REMOVE:Fragment Dat IF (*-1 ACCDATPREP BARRIER NPNH)(NOT *-1 V)(NOT *1 V); # but there are such fragments!

# Case chosen by the verb found to the left of the preposition.
REMOVE:ACCPREPVERB Dat IF (*-1 ACCDATPREP BARRIER NPNH LINK *-1 ACCPREPV BARRIER NOT-ADV) (0 Acc) ;
REMOVE:DATPREPVERB Acc IF (*-1 ACCDATPREP BARRIER NPNH LINK *-1 DATPREPV BARRIER NOT-ADV) (0 Dat) ;
REMOVE:STRONGDATPREPVERB Acc IF (*-1 SOMEACCDATPREP BARRIER NPNH LINK *-1 S-BOUNDARY BARRIER ACCPREPV) (0 Dat) ;
#REMOVE:STRONGDATPREPVERB Acc IF (*-1 ACCDATPREP BARRIER NPNH LINK *-1 Pr BARRIER ACCPREPV); # BARRIER NOT-ADV)(0 Dat);

# POS disambiguation
# ==================

# Adjectives

#kalur
REMOVE ("kala") IF (*-1 TAD + Dem) (0 ("kalur")) ;

# Verbs

# hava
REMOVE ("havi") IF (0 ("hava") + V) (0* Pl + Nom BARRIER Det) ;

# bera
SELECT V IF (-1 Adv) (0 ("bar")) (1 ("til")) ;
## Soleiðis bar til.

# vera
SELECT:vera ("vera") IF (0 ("ver")) (*-1 ("hava") BARRIER V) ;

# verða
SELECT:verda ("verða") IF (0 ("verður")) (*1 A OR Sup CBARRIER NOT-ADV) ;
## ... sum av tær verður gitin.

# kunna
SELECT:vfinkunna Inf IF (-1 VFIN) (0 ("kunna")) ;
SELECT:kunna Inf IF (0 ("kunna")) (*1 Inf BARRIER NOT-ADV) ;

# Pron Pers or Det
REMOVE Pers IF (-1 Pr) (0 (Det Gen)) (*1 N OR A BARRIER NPNH) ;
## vendu móti hansara bygd.
REMOVE Det IF (0 Pers) (1 EOS OR VFIN OR Pr OR CC OR S-BOUNDARY) ;
REMOVE Dem IF (0 Pers) (1 EOS OR VFIN OR Pr OR CC OR Num OR S-BOUNDARY) ;
REMOVE:DetNotDem (Pron Pers) IF (0 Dem + $$GENDER + $$NUMBER + $$NAGD) (1 NorA + $$GENDER + $$NUMBER + $$NAGD) ;

# Det (no idea why $$ does not work)
# One rule per gender x case combination, written out by hand.
REMOVE (Num Msc Nom) IF (0 (Det Msc Nom)) (1 (N Msc Nom) OR (A Msc Nom)) ;
REMOVE (Num Msc Acc) IF (0 (Det Msc Acc)) (1 (N Msc Acc) OR (A Msc Acc)) ;
REMOVE (Num Msc Gen) IF (0 (Det Msc Gen)) (1 (N Msc Gen) OR (A Msc Gen)) ;
REMOVE (Num Msc Dat) IF (0 (Det Msc Dat)) (1 (N Msc Dat) OR (A Msc Dat)) ;
REMOVE (Num Fem Nom) IF (0 (Det Fem Nom)) (1 (N Fem Nom) OR (A Fem Nom)) ;
REMOVE (Num Fem Acc) IF (0 (Det Fem Acc)) (1 (N Fem Acc) OR (A Fem Acc)) ;
REMOVE (Num Fem Gen) IF (0 (Det Fem Gen)) (1 (N Fem Gen) OR (A Fem Gen)) ;
REMOVE (Num Fem Dat) IF (0 (Det Fem Dat)) (1 (N Fem Dat) OR (A Fem Dat)) ;
REMOVE (Num Neu Nom) IF (0 (Det Neu Nom)) (1 (N Neu Nom) OR (A Neu Nom)) ;
REMOVE (Num Neu Acc) IF (0 (Det Neu Acc)) (1 (N Neu Acc) OR (A Neu Acc)) ;
REMOVE (Num Neu Gen) IF (0 (Det Neu Gen)) (1 (N Neu Gen) OR (A Neu Gen)) ;
REMOVE (Num Neu Dat) IF (0 (Det Neu Dat)) (1 (N Neu Dat) OR (A Neu Dat)) ;
## eina mynd

# Proper nouns
#REMOVE:InternalProp Guess IF (NOT -1 CLB OR PUNCT OR LEFT)(NOT 0 Guess);
REMOVE:SgProp Pl IF (0C Prop) ;
SELECT:Speciel_Pluralisform N + Pl IF (-1 ("millum" Pr)) ;

# Specific lexemes, words
# =======================

# aftan
SELECT Pr IF (0 ("aftan")) (1 Pr) ;
## aftan fyri sparikassan...

# allur
SELECT:ØllumSum Det + Neu + Sg + Dat IF (0 ("allur")) (1 COMMA) (2 ("sum")) ;

# at
SELECT CS IF (-2 ("av")) (-1 ("tað")) (0 ("at")) ;
REMOVE Pr IF (0 ("at")) (NEGATE 1 NP-MEMBER) ;

# á
SELECT Interj IF (-1 BOS) (0 ("á")) (1 COMMA OR CC) ;
## Á, eg veit ikki.
REMOVE:RemoveInterjAa Interj IF (NOT -1 BOS)(0 ("á"));

# ár
# The preceding lemma is the preposition "í", as listed in ACCDATPREP
# (the test was written with a plain "i").
SELECT (Neu Sg Acc) IF (-1 ("í"))(0 ("ár"));

# bara
SELECT ("bara" Adv) IF (1 Pron);
SELECT ("bara" Adv) IF (*1 N OR A BARRIER NPNH);
## Bara eygað sá roykin fara uppeftir sum óljóð.

# eg
SELECT:eg ("eg" Pron Pers Pl Nom) IF (*1 (V Pl) BARRIER NOT-ADV); #(*1 V + Pl BARRIER NOT-ADV);
## Vit mugu sleppa.

# ein
# NOTE(review): "Dat + Dat" is equivalent to plain Dat; "Det + Dat" may
# have been intended - confirm before changing.
REMOVE ("eini") IF (0 Dat + Dat )(1 A + Dat OR N + Dat) ;

# eingin
SELECT:eingin ("eingin") IF (-1 ("til"))(0 Gen);
## til einkis.

# hafa
SELECT ("hava") IF (0 ("høva"))(*1 Sup BARRIER NOT-ADV);
SELECT ("hava") IF (0 ("høva"))(-1 Sup);
## Teir høvdu lýtt á kongin.

# hann
REMOVE:hannur ("hannur") IF (NOT -1 PRE-N)(0 ("hann")); # hannur must be ein hannur or smth

# her
REMOVE ("hera" Imp) IF (0 ("her")); # Cannot come up with a condition calling for Imp of "hera"

# hetta
SELECT:hesinNom ("hesin" Pron Dem Neu Sg Nom) IF (1 VFIN)(NOT 2 Nom);
SELECT:hesinAcc ("hesin" Pron Dem Neu Sg Acc) IF (1 VFIN)(2C Nom);
REMOVE:Hetta ("Hetta" Prop) IF (NOT -1 Sem/Mal);

# hon
SELECT:hon Pron IF (0 ("hon"))(NOT 0 PRE-N);

# húsi
REMOVE:húsi ("húsi") IF (*-1 MOVEMENTVERB)(0 ("hús"));
## koma til húsa.

# ið
SELECT CS IF (-1 CLB)(0 ("ið"));

# innan
REMOVE ("inni") IF (0 ("innan" Pr))(*1 Acc BARRIER NPNH);
## Samstarv innan gransking.

# liggja
SELECT ("liggja") + Prt IF (-1 N OR Pron) ;
SELECT ("liggja") + Prt IF (1 Pr);

# men
SELECT CC IF (*-1 CLB OR BOS OR PUNCT BARRIER WORD)(0 ("men"));

# munandi
SELECT:munandi Adv IF (0 ("munandi"))(1 Adv);

# niðan
SELECT Adv IF (0 ("niðan"))(1 Pr);

# nú
REMOVE N IF (0 ("nú" Adv));
## í núið.

# ruður
REMOVE ("ruður") IF (0 ("runnur")); # Synonyms

# seg
SELECT Refl IF (0 ("seg" Refl $$NAGD))(1 ("sjálvur" $$NAGD));

# sjalvur
SELECT Adv IF (0 ("sjálvt"))(1 ("um") OR EOS);
## Tey vóru so hugnalig, sjálft um tey sóu hjálparleys út.
SELECT:SærSjalvari Dat IF (-1 ("seg" Dat)) (0 ("sjálvur")) ;

# skal
REMOVE ("skal") IF (0 ("skula")) (1 Inf) ;

# tann
SELECT ("tann" Det) IF (1 ("sum")) ;
SELECT ("tann" Det $$NAGD) IF (*1 N + $$NAGD OR A + $$NAGD BARRIER NPNH) ;

# tá and tá ið
SELECT Adv IF (0 ("tá")) (1 ("ið") OR V OR Det OR Pron OR N) ;
SELECT Adv IF (-1 ("tá")) (0 ("ið")) ;
SELECT Adv IF (-1 Adv OR NPNH OR COMMA) (0 ("tá")) (1 Det OR Pron OR V) ;
# Og tá eg havi...
# Fá boð, tá nýtt er at frætta.
SELECT Adv IF (-1 NPNH) (1 Ind OR Sbj) ;

REMOVE:r5 Det IF (0 Pers) (NEGATE 0 Gen LINK -1 N) (NOT 1 NPNHA OR N) ;

# um
REMOVE Imp IF (0 ("um")) ;
# This is against the verb ymja, súsa, brúsa, dynja, (kvæð.) munnur og nasar umdu í blóði
# (blóðið fossaði)
# In order to write a good rule for this we need examples of imperative use (which is "um").
# I did not intend to write a 'remove all ymja' rule, but simply cannot come up with good
# contexts including 'ymja' but excluding 'um'.

# unglingi
REMOVE ("unglingi") IF (0 ("unglingur")) ; # perhaps with regex *lingi vs *lingur

# væl
REMOVE Imp IF (NOT -1 BOS) (0 ("væl")) (NOT 1 ("um")) ;
# Again, we need real-life examples for imperative of
# væla 2 -di s 1: ~ um 1 bøta um, hjálpa upp á, umvæla, ~ um húsini
# 2 (sj.) hjúkla um, hugsa ikki um at ~ um meg

# inni
SELECT Adv IF (-1 V) (0 ("inni")) (1 Pr) ;

# á
REMOVE Interj IF (1 WORD) (0 ("á" Pr)) ;
REMOVE Interj IF (-1 V) (0 ("á" Pr)) ;

# Adverbs

# General adverb
SELECT:r6 Adv IF (-1 DATPREP) (*1 Dat BARRIER NPNHA) ;
SELECT:r7 Adv IF (-1 VFIN) (1 Inf) ;
SELECT:her Adv IF (0 ("herur")) (-1 N OR BOS OR Adv) ;
REMOVE Adv IF (-1 Det OR Pr) (0 A) (1 N) ;
## ið stakk í bleyta jørð,

# Lexicalised adverbs.
REMOVE:r566 (A* Adv) IF (0 LEX-ADV) ;
## Serliga er tað ein.
# væl
SELECT Adv IF (-1 VFIN) (0 ("væl")) (1 IM) ;

# Adverb verbs
SELECT:umleid Adv IF (0 ("umleið")) (1 Num) ;

# Idioms
SELECT ("stórur" A Neu Sg Dat) IF (-2 ("í")) (-1 ("tað")) (1 ("og")) (2 ("heilur")) ;
SELECT ("heilur" A Neu Sg Dat) IF (-4 ("í")) (-3 ("tað")) (-2 ("stórur")) (-1 ("og")) ;
SELECT ("væl" Adv) IF (1 ("skera" PrfPrc)) ;
SELECT ("skera" PrfPrc) IF (-1 ("væl" Adv)) ;
## vel skorin
SELECT ("innast" Adv) IF (1 ("inni")) ;
## innast inni

# NP internal constraints
# =======================

# Determiner disambiguation
# r8-r10 handle determiners and carry an extra NOT condition on a Poss
# reading with a preceding N; r11-r13 are the adjective counterparts.
REMOVE:r8 Neu IF (0 Det + $$NAGD) (NOT 0 Poss + $$NAGD LINK -1 N + $$NAGD) (*1 NounMscFem + $$NAGD BARRIER NPNHA LINK NOT 0 Neu) ;
REMOVE:r9 Msc IF (0 Det + $$NAGD) (NOT 0 Poss + $$NAGD LINK -1 N + $$NAGD) (*1 NounFemNeu + $$NAGD BARRIER NPNHA LINK NOT 0 Msc) ;
REMOVE:r10 Fem IF (0 Det + $$NAGD) (NOT 0 Poss + $$NAGD LINK -1 N + $$NAGD) (*1 NounMscNeu + $$NAGD BARRIER NPNHA LINK NOT 0 Fem) ;
REMOVE:r11 Neu IF (0 A + $$NAGD) (*1 NounMscFem + $$NAGD BARRIER NPNHA LINK NOT 0 Neu) ;
REMOVE:r12 Msc IF (0 A + $$NAGD) (*1 NounFemNeu + $$NAGD BARRIER NPNHA LINK NOT 0 Msc) ;
REMOVE:r13 Fem IF (0 A + $$NAGD) (*1 NounMscNeu + $$NAGD BARRIER NPNHA LINK NOT 0 Fem) ;

REMOVE:Def Def IF (0 A + $$NAGD) (*1 A OR N BARRIER NPNHA LINK 0C $$NAGD + Indef) ;
REMOVE:Def Indef IF (0 A + $$NAGD) (*1 A OR N BARRIER NPNHA LINK 0C $$NAGD + Def) ;

REMOVE Det + $$GENDER IF (*1C N LINK NOT 0 $$GENDER ) ;
SELECT Det + $$GENDER IF (1C A OR N + $$GENDER) ;

# Postnominal determiner disambiguation
SELECT:PossNum Sg IF (0 Poss) (-1C N + Sg) ;
SELECT:PossNum Pl IF (0 Poss) (-1C N + Pl) ;

REMOVE:r8b Neu IF (0 Poss + $$NAGD) (-1 NounMscFem + $$NAGD LINK NOT 0 Neu) ;
REMOVE:r9b Msc IF (0 Poss + $$NAGD) (-1 NounFemNeu + $$NAGD LINK NOT 0 Msc) ;
REMOVE:r10b Fem IF (0 Poss + $$NAGD) (-1 NounMscNeu + $$NAGD LINK NOT 0 Fem) ;
#REMOVE:r11b Neu IF (0 A + $$NAGD)(-1 NounMscFem + $$NAGD LINK NOT 0 Neu);
#REMOVE:r12b Msc IF (0 A + $$NAGD)(-1 NounFemNeu + $$NAGD LINK NOT 0 Msc);
#REMOVE:r13b Fem IF (0 A + $$NAGD)(-1 NounMscNeu + $$NAGD LINK NOT 0 Fem);

# Definiteness disambiguation
REMOVE Def IF (0 A + Indef + $$NAGD) (1C N + Indef + $$NAGD) ;
REMOVE Indef IF (0 A + Def + $$NAGD) (1C N + Def + $$NAGD) ;

# Case disambiguation
# r16 repeats r14 and r17 repeats r15; both pairs are kept as found.
SELECT:r14 $$NAGD IF (0 Det) (NOT 0 Poss LINK -1 N) (*1C $$NAGD BARRIER NPNHA) ;
REMOVE:r15 $$NAGD IF (0 Det) (NOT 0 Poss LINK -1 N) (*1 N OR A BARRIER NPNHA LINK NOT 0 $$NAGD) ;
SELECT:r16 $$NAGD IF (0 Det) (NOT 0 Poss LINK -1 N) (*1C $$NAGD BARRIER NPNHA) ;
REMOVE:r17 $$NAGD IF (0 Det) (NOT 0 Poss LINK -1 N) (*1 N OR A BARRIER NPNHA LINK NOT 0 $$NAGD) ;

REMOVE:AdvErTad Acc IF (-2 Adv) (-1 VFIN) (0 ("tað")) (1 Adv) ;

# Noun disambiguation
# -------------------
# The noun takes over gender, case and number from an unambiguous
# (-1C) adjective, determiner or indefinite pronoun directly before it.
SELECT:r28 $$GENDER IF (-1C (A) + $$GENDER) (0 N) ;
SELECT:r30b $$GENDER IF (-1C (Det) + $$GENDER) (0 N) ;
SELECT:r30c $$GENDER IF (-1C (Pron Indef) + $$GENDER) (0 N) ;

SELECT:r29 $$NAGD IF (-1C (A) + $$NAGD) (0 N) ;
SELECT:r30b $$NAGD IF (-1C (Det) + $$NAGD) (0 N) ;
SELECT:r30c $$NAGD IF (-1C (Pron Indef) + $$NAGD) (0 N) ;

SELECT:r30 $$NUMBER IF (-1C (A) + $$NUMBER) (0 N) ;
SELECT:r30b $$NUMBER IF (-1C (Det) + $$NUMBER) (0 N) ;
SELECT:r30c $$NUMBER IF (-1C (Pron Indef) + $$NUMBER) (0 N) ;

# Poss disambiguation
REMOVE:r31 $$GENDER IF (-1 N LINK NOT 0 $$GENDER) (0 Poss) ;
REMOVE:r32 $$GENDER IF (0 Poss) (*1 N OR A BARRIER NPNHA LINK NOT 0 $$GENDER) ;
SELECT:DisPostPoss Poss IF (-1 N + $$GENDER + $$NAGD) (0 $$GENDER + $$NAGD) ; #(NOT 1 ...)

# Number disambiguation
REMOVE:r33 Pl IF (*-1 ("eitt" Num Sg) OR (Det Sg) BARRIER NOT-A) (0 (N Sg)) ;
# Here, we need vislcg3 and variable notation!
REMOVE:PlPl (N Sg) IF (0 (N Pl))(-1 Pl)(NOT -1 Sg);
SELECT:NumNPl (N Pl) IF (-1 NUMBERS);
SELECT:BarePl (N Pl) IF (NOT -1 PRE-N)(0 BAREPLURALS);

# Coordination
REMOVE Def IF (0 N + Indef)(1 CC)(2C N + Indef);
REMOVE Indef IF (0 N + Def) (1 CC)(2C N + Def);
SELECT:UniformCC N + $$NAGD IF (*-1 CC BARRIER NPNH LINK -1 N + $$NAGD);
# Case agreement across a conjunction, to the right and to the left.
# The leftward rule looks at -1 / -2, mirroring the rightward 1 / 2
# (it previously read -11 / -12).
SELECT $$NAGD IF (1 CC)(2C $$NAGD);
SELECT $$NAGD IF (-1 CC)(-2C $$NAGD);

# NP head disambiguation
# ======================

# Inversion
REMOVE:r34 Acc IF (-2 CS)(-1 (V Pl))(0 (Pl Nom)) ;
## Tað var, sum váru tey sett út úr luftini.

# Elliptic AP as NP
# =================
SELECT:DefectAP Msc IF (0C REALADJ)(1 S-BOUNDARY) ;

# P chains or not
SELECT:r45 Adv IF (0 Pr)(1C Pr);
## Hann fór niðan um Danmark.

# Pronoun disambiguation
REMOVE:r46 Det IF (0 Pers)(1C V OR CLB OR Adv);
## Tey vóru so hugnalig.
REMOVE:r46 Dem IF (0 Pers)(1C V OR CLB OR Adv);
## at geva teimum, tí at...
REMOVE:r47 Det IF (0 Pers)(NEGATE *1 Def OR ("báðir") BARRIER NPNH)(NEGATE 0 Gen LINK -1 N);
## # teimum báðum, and probably more Det + Pron

#REMOVE:annar Pron IF (0 ("annar" $$NUMBER))(1 (N $$NUMBER));
#REMOVE:annar ("annar" Pron $$NAGD) IF (1 (N $$NAGD));
REMOVE:annar ("annar" Pron) + $$NAGD IF (1 N + $$NAGD);

#NP Coordination
REMOVE:CCPrs Prs + 3Sg IF (0 Pl + Nom)(1 CC)(*2 Pl + Nom BARRIER NPNH OR OBL);

# VP disambiguation
# =================

# V or A
REMOVE:r48 A IF (0 ("vera"))(1 Sg);
## Enskt mál er móðurmál.
# This is probably too strict, cf.
# Det er ein ljos og var blå farge (var = forsiktig)
# Waiting for real examples...
SELECT:r49 ("vera" 3Sg) IF (-1 ("tað" Pron Pers Sg Nom));
## Tað var, sum váru tey sett av tilvild av einum fóti út úr luftini.
REMOVE:PrtNotAdj A IF (-1 NOMINALHEAD + Sg + Nom) (0 (V Prt Sg) OR (V Prt 3Sg) OR (V Prt 1Sg)) ;
REMOVE:ErtTu ("erur") + Adv IF (*0 ("tú")) ;
#REMOVE

# Infinitive
SELECT:r51 Inf IF (*-1 MODV BARRIER V) ;
REMOVE (V Pl) IF (-2 VFIN + Sg) (-1 (Pron Sg)) (0 Inf) ;
REMOVE (V Pl) IF (-1 N + Sg LINK *-1 VFIN + Sg BARRIER NPNH) (0 Inf) ;
#REMOVE Inf IF (-1 (Pl Nom))(0 (V Pl))(NOT -1 Acc LINK *-2 VFIN BARRIER NPNHA OR CLB);
REMOVE Inf IF (-1 (Pl Nom)) (0 (V Pl)) (NOT -1 Acc) (NOT -2 VFIN) ;

# Imperative
# The best would be to make a corpus of imperative sentences, identify
# all the imperatives, and then just remove the rest.

# REMOVE:ImpFirst N IF (-1 BOS OR PUNCT)(0 Imp)(*1 ("tín" Poss) BARRIER VFIN);
#
# REMOVE Imp IF (-1C Ind OR PrfPrc OR Inf OR PrsPrc OR Sup OR Refl);
# ## Guð gjørði hvalvið.
# REMOVE Imp IF (-1 COMMA)(-2C Ind OR PrfPrc OR Inf OR PrsPrc OR Sup OR Refl);
#
# REMOVE Imp IF (-1 CS)(0 Ind OR Sbj);
#
# REMOVE Imp IF (-1 Adv OR Nom OR Pr OR CS OR V)(0 Prs OR Prt);
# This rule was commented out, but seems to be sensible, after all.

# Here come all rules selecting Imp.
SELECT:CoordImp Imp IF (*-1 CC OR COMMA BARRIER VFIN LINK *-1C Imp BARRIER VFIN) ;

# Then we remove the remaining ones.
REMOVE Imp ;
#REMOVE:r52 Imp IF (NOT *-1 CC OR COMMA LINK *-1 Imp)(NOT *-1 BOS OR PUNCT BARRIER WORD); # hmm

REMOVE Sup IF (*-1 CLB BARRIER WORD) (0 Imp) (*1 CLB BARRIER Ind) ;

#REMOVE:r53 Imp IF (0 N)(1 CC)(*2 N BARRIER NPNHA);
## Stýrið og stjórnin hjá...
#REMOVE:r54 (Imp Pl) IF (0 (N Neu))(1 VFIN);
## Húsið er stórt.
# Present participle

# Supine
#SELECT:r55 Sup IF (*-1 ("fáa" V) OR ("hava" V Ind) BARRIER NOT-ADV); #check
REMOVE:r56 Sup IF (0 Ind)(NOT *-1C VFIN BARRIER S-BOUNDARY)(NOT *1C VFIN BARRIER S-BOUNDARY);
REMOVE:SFregel N IF (0 Sup)(1 VFIN);

# Present singular
# The pronoun alternative is written out for "hann", "hon" and "tað",
# the lemmas in LIST 3PERS. It previously read (3PRON Pers Sg Nom), and
# no tag or set named 3PRON is declared in this grammar.
# NOTE(review): confirm the analyser does not emit a literal 3PRON tag.
SELECT:r57 (V Ind 3Sg) IF (-1 (N Prop Nom) OR (N Sg Nom) OR ("hann" Pers Sg Nom) OR ("hon" Pers Sg Nom) OR ("tað" Pers Sg Nom) LINK NOT *-1 CC BARRIER NPNHA)(1 (Pron Refl Acc)) ;
REMOVE:test1 (V Ind 1Sg) IF (NEGATE 0* ("eg" Pron Pers Sg Nom));
REMOVE:test2 (V Ind 2Sg) IF (NEGATE 0* ("tú" Pron Pers Sg Nom)); # Tino!
REMOVE (V Ind 2Sg) IF (*-1 CLB LINK *-1 2Sg OR Prt + Sg LINK -1 ("tú" Pron Pers Sg Nom) LINK -1 ("sum") OR ("ið"));
SELECT 1Sg IF (-1 ("eg" Pron Pers Sg Nom));
REMOVE:r58b (V Ind 1Sg) IF (*-1 (N Sg Nom) BARRIER NOT-ADV LINK NOT *1 Pron + 1Sg BARRIER CLB) ;
SELECT:r59 (V Ind 3Sg) IF (-1 (N Prop Nom) OR (N Sg Nom) LINK NOT *-1 CC BARRIER NPNHA OR S-BOUNDARY) ;
REMOVE 1Sg IF (-1 V)(0 Acc);
## "at fáa høvi at..."
SELECT:2sg 2Sg IF (1 ("tú" Pron Pers Sg Nom));
SELECT:2sg 2Sg IF (-1 ("tú" Pron Pers Sg Nom));
REMOVE:CoordVerb (V Ind 1Sg) IF (-1 CS OR CC LINK *-1C (V Ind Sg) OR (V Ind 3Sg) BARRIER 1Sg);
## Hann legði høkuna á knappin og læt eygað hvíla.
SELECT:r60 (V 2Sg) IF (*-1 ("tú" Pron Pers Sg Nom) BARRIER NOT-ADV);
SELECT:r60b (V 2Sg) IF (-1 ("og") LINK *-1C (V 2Sg) BARRIER S-BOUNDARY);

# Present plural
SELECT:r61 (V Pl) IF (-1 (N Pl Nom) OR ("eg" Pron Pers Pl Nom))(NOT -1 (Acc) LINK *-1 VFIN BARRIER NPNH);

# V + Refl
SELECT:r62 ("seg" Refl) IF (-1 Inf OR 3Sg OR Pl) ;

# Nominative
# ==========
#REMOVE Acc IF (-1 CS OR BOS)(0 Nom)(1 VFIN OR Adv)(*2 S-BOUNDARY OR CLB OR ("sum") BARRIER NOMINALHEAD + Nom); # hmm
# Fjallið lá stilt, tað ikki so mikið sum andi.
REMOVE:NotTopAcc Acc IF (*-1 S-BOUNDARY BARRIER NPNH) (*1 ("vera") BARRIER V) (NEGATE *1 SOMEACCPREP LINK 1 EOS) ;
REMOVE:NotTopAcc2 Acc IF (*-1 S-BOUNDARY BARRIER NPNH) (0 Nom + Pl) (1 V + Pl) (NOT 2 Nom + Pl) ;

# Accusative
# ==========
REMOVE:r63 Nom IF (*-1 AUX OR MODV BARRIER NPNH LINK -1 CC OR Nom) (1 Inf) (NOT *1 Pr LINK 1 BOS) ;
REMOVE:AdInfAcc Nom IF (*-1 Inf BARRIER NPNH LINK -1 IM) ;

# Genitive
# ========
# Genitive is marginal in Faroese. The strategy should be
# 1. Write rules selecting genitive. (such rules are still forthcoming)
SELECT:TILRULE Gen IF (-1 ("til")) (0 (N Indef) OR (Prop) OR Pron) ;
SELECT:r64 Gen IF (*-1 ACCGENPREP BARRIER NPNHA) (NOT 0 Acc) ;
# 2. Remove the remaining genitives.
REMOVE:r65 Gen IF (NOT *-1 ACCGENPREP BARRIER NPNHA) ;

# Pronoun disambiguation
# =======================
REMOVE Det IF (0 Pron) (NOT 1 A OR N OR ("báðir")) ; # More pronouns in here: teimum báðum
REMOVE ("hon") IF (*-1 (Imp Sg) BARRIER ("tú")) ;
## Tak við tær barnið og flýggja til Egyptalands.
REMOVE:r66 ("hannur") IF (0 ("hann" Nom)) (1 (V 3Sg)) ;
## Han var her.
SELECT (Pron Pers Pl Nom) IF (1 ("sum")) (2 Pl) ;
REMOVE:r66b (Pl Nom Indef) IF (1 (N Sg) OR (V Sg) OR (V 3Sg)) (NOT 1 (V Pl)) ;
REMOVE ("vit") IF (*-1 V + Pl BARRIER V) (0 (Pron Pers Pl Nom)) ;
## Eru vit helst...

# List declared here, next to the only rule that references it.
LIST AccNom = Nom Acc ;
SELECT:PersAgreem $$AccNom IF (0 Pers LINK 1 V LINK *1 $$AccNom + N ) ; # experiment, look at this
#$ Hann fór niðan um bøgarðarnar.

# Verb disambiguation
# ===================
SELECT:2Sg 2Sg IF (-1C ("tú" Pron Nom)) ;
## Tú ert.
SELECT:1Sg 1Sg IF (-1C Adv) (0C V) (1 ("eg" Pron Nom)) (NOT 2 CC) ;
## Fyrst vil eg, ...
REMOVE N IF (-1 ("sum" CS)) (0 INDSBJ) (NOT 1 INDSBJ) ;

# VP disambiguation
REMOVE:r113 V IF (-1 (Det Nom)) (0 (N Nom)) ;

# Number disambiguation
#SELECT $$NUMBER IF (0 A)(1 N + $$NUMBER);
REMOVE:r23 (A Sg) IF (1C N + Pl) ;
REMOVE:r23b (A Sg) IF (1 CC) (2 A + Pl) (3C N + Pl) ;
REMOVE:r24 (A Pl) IF (1C N + Sg) ;
## í ta døkku moldina.
REMOVE (N Pl) IF (*-1C (Det Sg) BARRIER NPNH) (0 (N Sg)) ;
SELECT:r25 $$NUMBER IF (0 Det) (*1 N + $$NUMBER OR A + $$NUMBER BARRIER NOT-A) ;
SELECT (Num Nom) IF (-1 BOS) (1 EOS) ;

# Postverbal subject
SELECT:PostverbalSubj (Pl Nom) IF (-1 V + Pl) (-2 Adv OR Acc) ;

# Gender disamb of numerals
SELECT:r26 $$GENDER IF (0 Num) (*1C N + $$GENDER BARRIER NOT-A) ;
## eit sindur

# Case disamb of numerals
SELECT:r27 $$NAGD IF (0 Num) (*1C N + $$NAGD BARRIER NOT-A) ;
# Perhaps also rules that remove Neu if Msc or Fem, etc.

# Ordinals
REMOVE:r114 (A Sg) IF (1 (Num Sg)) ;
## fyrstu tríggjar mánðirnar.

# Coordination
SUBSTITUTE:einsog (CC) (CS) TARGET ("og") IF (-1 ("ein" Det Neu Sg Gen)) ;

# Tense / mood agreement across a conjunction: first with the verb two
# positions to the left, then with the verb two positions to the right.
SELECT Prt IF (NOT 0 OBL LINK *-1 Pr BARRIER NPNH) (-2C Prt) (-1 CC) ;
SELECT Prs IF (NOT 0 OBL LINK *-1 Pr BARRIER NPNH) (-2C Prs) (-1 CC) ;
SELECT Imp IF (NOT 0 OBL LINK *-1 Pr BARRIER NPNH) (-2C Imp) (-1 CC) ;
SELECT Prt IF (NOT 0 OBL LINK *-1 Pr BARRIER NPNH) (2C Prt) (1 CC) ;
SELECT Prs IF (NOT 0 OBL LINK *-1 Pr BARRIER NPNH) (2C Prs) (1 CC) ;
SELECT Imp IF (NOT 0 OBL LINK *-1 Pr BARRIER NPNH) (2C Imp) (1 CC) ;

SELECT A IF (0 (A $$GENDER $$NAGD)) (-1 CC) (-2 (A $$GENDER $$NAGD)) ;

#SELECT:AlwaysVfin VFIN IF (NOT 0* VFIN BARRIER CLB); # Tino!
## Hugur hansara føldist stórur og heitur , fylti alt upp inni í honum og bara hvíldi í sær sjálvum .
## Læt ymist mala fram fyri eina løtu , til tað hevði tømt seg , fyri so at fara burtur av sær sjálvum
# The first sentence indicates the 0* does not work.
# The second sentence indicates the rule is too strict.

# Substituting tags
SUBSTITUTE:sumPr (CS) (Pr) TARGET ("sum") (NOT -1 BOS OR COMMA) ;

# CC Coordinate NPs
SELECT (V Pl) IF (-1 NOMINALHEAD + Nom LINK *-1 CC BARRIER NPNH LINK -1 NOMINALHEAD + Nom) ;

REMOVE:Dangerous (V Ind 2Sg) IF (0 (V Ind 3Sg)) (NOT 0 Pass) ;

# AFTER-SECTIONS
## For Apertium
REMOVE SUB:1 (cmp) ;
REMOVE (adj cmp) IF (0 (adj)) ;
#REMOVE A + Cmp IF (0 A + $$NAGD + $$GENDER + $$NUMBER) (0 A + Cmp + $$NAGD + $$GENDER + $$NUMBER) ;