#
# First real test grammar for gerund detection.
#
# ATTENTION: uses the LingPipe interpretation of the Brown/Penn tagset.
#
# ATTENTION: sets and templates with underscores are meant to be used
# globally in the entire section they are specified in.
#
# NOTE(review): every quoted word-form/lemma string visible here is empty
# (""), with only its suffix flags (r, ir) left -- presumably the contents
# were lost somewhere; confirm against the original grammar.
#

#############################################################################
#
# Common definitions
#
#############################################################################

#----- HANDLING PUNCTUATION PROPERLY ----------------------------------------

# TODO: abbreviations are broken.
DELIMITERS = "" ; # sentence window

#----- LEXICAL BITS AND PIECES ----------------------------------------------

# Fixed expressions that precede the gerund.
# Each alternative is anchored at position 0 and chains the following
# cohorts with LINK +1. The commented-out alternatives are kept as found.
TEMPLATE _FIXED_EXPRESSIONS =
    (0 (""ir) LINK +1 ("") LINK +1 (""))
    OR (0 (""ir) LINK +1 (""))
    OR (0 (""ir) LINK +1 (""))
    #OR
    #(0 (""ir))
    #OR
    #(0 (""ir))
    #OR
    #(0 (""ir))
    ;

# Fixed expressions used by the clue rules in front of an infinitive chunk
# (INF-B).
TEMPLATE _FIXED_EXPR_TOINF =
    (0 (""ir) LINK +1 ("") LINK +1 (""))
    OR (0 (""ir) LINK +1 ("") LINK +1 (""))
    OR (0 (""ir) LINK +1 (""))
    ;

# verbs
# Targets of the CLU-GERONLY-B clue rule.
LIST _LEX_ALWAYS_GERUND = ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r "" "" ""r ""r ""r ""r "" ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r "" ;
# Targets of the CLU-BOTHMEANDIFF-B clue rules.
LIST _LEX_GERUND_OR_TOINF_MEANING_CHANGE = ""r "" "" ""r ""r ""r ""r ""r "" "" "" ;
# Targets of the CLU-BOTHMEANSAME-B clue rules.
LIST _LEX_GERUND_OR_TOINF_NO_MEANING_CHANGE = "" "" "" ""r ""r ""r ""r ""r ""r ""r ;
# Targets of the verbal CLU-INFONLY-B clue rule.
LIST _LEX_ALWAYS_TOINF = ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ""r ;

# nouns
# noun triggers with frequency > 100
LIST _NOUN_ALWAYS_TOINF = ""r ""r ""r "" "" "" "" "" "" "" "" "" ""r "" "" "" "" "" "" "" "" ""r "" ""r "" "" "" "" "" ;

# Nouns for which we are not sure whether they make good triggers; therefore
# we only include them in the colorize activity (Brown_Conll and English page lists).
LIST _NOUN_TOINF_COLORIZE = ""r ""r "" "" "" "" "" "" ""r ""r "" ""r "" ""r "" "" "" ""r ""r "" ""r ""r "" "" ""r ""r "" ;

# Union of both "gerund or to-infinitive" verb lists (the statement continues
# on the next line).
SET _LEX_GERUND_OR_TOINF = _LEX_GERUND_OR_TOINF_MEANING_CHANGE OR
_LEX_GERUND_OR_TOINF_NO_MEANING_CHANGE;

# All verbs after which a gerund reading is considered by the lexical rules.
SET _LEX_GERUND_CANDIDATES = _LEX_GERUND_OR_TOINF OR _LEX_ALWAYS_GERUND;

#----- GLOBAL ABSTRACT SETS OF POS ------------------------------------------

LIST _INGFORM = VBG HVG BEG ; # gerunds or present participle
LIST _INFIN = BE-INF DO-INF HV-INF VB-INF; # to-infinitives (introduced below)
# Third member is HV (previously misspelled "GV"): it has to be the same tag
# that is rewritten to HV-INF during preprocessing and that is listed in
# _ANYVERB_ANYFORM, otherwise "have" is never recognised as uninflected.
LIST _UNINF = BE DO HV VB; # uninflected (infinitives or imperatives without to)
LIST _ADVER = RB RBR RBT; # some types of adverbs used with infinitives
LIST _NEG = "" ""; # negation words, TODO: contractions?!
LIST _ADJEC = JJ JJT JJS JJR;
LIST _NOUNS = NN NN$ NNS NNS$ NP NP$ NPS NPS$;
# Every verbal tag, including the -INF tags introduced during preprocessing;
# used mostly as a BARRIER for scanning contexts.
LIST _ANYVERB_ANYFORM = VB VBD VBG VBN VBZ BE BED BEDZ BEG BEM BEN BER BEZ DO DOD DOZ HV HVD HVG HVN HVZ MD BE-INF DO-INF HV-INF VB-INF;

#############################################################################
#
# Preprocessing: fixing tagger issues and introducing ambiguous readings
#
#############################################################################

BEFORE-SECTIONS

# Repair mistagged "to" before two or more adverbs: IN becomes TO when,
# scanning right over adverbs only, an unambiguously uninflected verb follows.
"" SUBSTITUTE (IN) (TO) (IN) ( +1*C _UNINF BARRIER (*) - _ADVER );
"" SUBSTITUTE (IN) (TO) (IN) ( +1*C _UNINF BARRIER (*) - _ADVER );

# Introduce a new tag for double quotes. The tagset is missing this one. As the
# tagger does not know them, it produces all kinds of funny analyses instead.
APPEND ("" QUOT) ("<">");
APPEND ("" QUOT) ("<">");
SELECT (QUOT) (0 (QUOT));

# Simple sentence-initial question forms may have been assigned any
# funny tag.
# Add a past-tense "be" reading (BEDZ / BED) to the matched word form when a
# PPSS pronoun follows directly.
APPEND ("" BEDZ) ("") (+1 (PPSS));
APPEND ("" BED) ("")(+1 (PPSS));
APPEND ("" BEDZ) ("") (+1 (PPSS));
APPEND ("" BED) ("") (+1 (PPSS));

# 's contraction is always BEZ if not genitive 's
LIST GENITIVE = NN$ DT$ CD$ AP$ RB$ PN$ NR$ NPS$ NP$ NNS$ JJ$;
APPEND ("" BEZ) ("") (-1 ("<'>")) (NOT 0 GENITIVE);

# Introduce infinitive tags (Brown tagset only knows "uninflected forms"):
# an uninflected verb becomes X-INF when an unambiguous TO precedes it with
# nothing but adverbs in between.
SUBSTITUTE (BE) (BE-INF) (BE) (-1*C (TO) BARRIER (*) - _ADVER);
SUBSTITUTE (DO) (DO-INF) (DO) (-1*C (TO) BARRIER (*) - _ADVER);
SUBSTITUTE (HV) (HV-INF) (HV) (-1*C (TO) BARRIER (*) - _ADVER);
SUBSTITUTE (VB) (VB-INF) (VB) (-1*C (TO) BARRIER (*) - _ADVER);

# Add readings for all -ing forms (this should be done in a smarter way):
# one gerund (GERU), one progressive (PROG) and one participle (PART)
# reading per -ing tag, to be disambiguated by the SECTION rules.
# SUBSTITUTE ("" VBG) ("---remove---" VBG) (VBG);
APPEND ("" VBG GERU) (VBG);
APPEND ("" HVG GERU) (HVG);
APPEND ("" BEG GERU) (BEG);
APPEND ("" VBG PROG) (VBG);
APPEND ("" HVG PROG) (HVG);
APPEND ("" BEG PROG) (BEG);
APPEND ("" VBG PART) (VBG);
APPEND ("" HVG PART) (HVG);
APPEND ("" BEG PART) (BEG);

# remove readings without tags.
REMOVE ("---remove---" VBG);

# Add reading for going-to-future.
# The appended reading carries VBG (previously misspelled "VGB"), so that a
# cohort reduced to this reading is still covered by _INGFORM and
# _ANYVERB_ANYFORM like every other -ing reading added above.
"" APPEND ("" VBG GOFU) (VBG);

# Add reading for going-to-future in the past (same tag correction).
"" APPEND ("" VBG GOFUPA) (VBG);

# some rare special cases:
# gerund-noun ("fishing-pole") seems to be noun compound, the gerund is
# assumed to be lexicalized.
# Lexicalised -ing form inside a hyphenated noun compound: add an NN GLEX
# reading when a hyphen and then a noun follow (or precede), and keep only
# that reading.
APPEND ("" NN GLEX) _INGFORM (+1 ("<->")) (+2 _NOUNS);
APPEND ("" NN GLEX) _INGFORM (-1 ("<->")) (-2 _NOUNS);
SELECT (NN GLEX);

# same for adjective compounds ("good-looking")
APPEND ("" JJ GLEX) _INGFORM (+1 ("<->")) (+2 _ADJEC);
APPEND ("" JJ GLEX) _INGFORM (-1 ("<->")) (-2 _ADJEC);
SELECT (JJ GLEX);

#############################################################################
#
# Rules: disambiguating -ing forms
#
#############################################################################

SECTION

#----- UTILITY TEMPLATES AND LISTS/SETS -------------------------------------

# finite forms of to be, not including imperative
# NOTE(review): BEN is listed here as well -- confirm that this is intended
# for a list described as "finite" forms.
LIST _BE_FINITE = BEM BEZ BER BEN BED BEDZ;

# determiners and pronouns used in NPs
LIST _NP_DETPRON = DT DTS DTI AT PP$ PP$$ PPL PPLS PPO PPS PPSS;

# these are used as clause delimiters
LIST _COMMA = "<,>" "<;>";
SET _COMMA_OR_SUBORD = _COMMA OR (CS);

#----- GOING-TO FUTURE AND FUTURE-IN-THE-PAST -------------------------------

# forms of "to be" used in going-to future (in-the-past)
LIST GOFU_BE = BER BEM BEZ;
LIST GOFUPA_BE = BED BEDZ;

# simple going-to-future
# sentences: a present form of "be" to the left (only negation words and
# adverbs may intervene), followed directly by TO and an infinitive
SELECT (GOFU) (-1* GOFU_BE BARRIER (*) - _NEG - _ADVER) (+1 (TO)) (+2 _INFIN);
# questions: "be" anywhere to the left with no other verb in between, plus
# the word form tested by the last context somewhere from position +3 on
SELECT (GOFU) (-1* GOFU_BE BARRIER _ANYVERB_ANYFORM) (+1 (TO)) (+2 _INFIN) (+3* (""));

# going-to-future in the past
SELECT (GOFUPA) (-1* GOFUPA_BE BARRIER (*) - _NEG - _ADVER) (+1 (TO)) (+2 _INFIN);
# question
SELECT (GOFUPA) (-1* GOFUPA_BE BARRIER _ANYVERB_ANYFORM) (+1 (TO)) (+2 _INFIN) (+3* (""));

#----- OTHER PROGRESSIVE FORM READINGS --------------------------------------

# sentence
SELECT (PROG) (-1* _BE_FINITE BARRIER (*) - _NEG - _ADVER);
# question
SELECT (PROG) (-1* _BE_FINITE BARRIER _ANYVERB_ANYFORM) (+1* (""));

#----- PARTICIPLE READINGS --------------------------------------------------

# "... the (not) very brightly shining gem ..."
# determiner/pronoun to the left (only adverbs, negation and QL may
# intervene) and a noun directly to the right
SELECT (PART) (-1* _NP_DETPRON BARRIER (*) - _ADVER - _NEG - (QL)) (+1 _NOUNS);

# "... you racing ..."
# "... of him having stolen ..."
# "... herself wondering how ..."
# Participle reading directly after a PPO or PPL pronoun.
SELECT (PART) (-1 (PPO));
SELECT (PART) (-1 (PPL));

# Participle at the sentence beginning.
# TODO: does this quite sledgehammery rule overgenerate??
SELECT (PART) (-1 (>>>)) (+1* _COMMA_OR_SUBORD BARRIER _ANYVERB_ANYFORM );

# parenthesis
# "... , Peter talking loudly to the mirror, ..."
# TODO: this does overgenerate!
SELECT (PART) (-1* _COMMA BARRIER _ANYVERB_ANYFORM ) (+1* _COMMA_OR_SUBORD BARRIER _ANYVERB_ANYFORM );

# "(he said),turning to her:"
SELECT (PART) (-1* _COMMA BARRIER _ANYVERB_ANYFORM ) (+1* (<<<) BARRIER _ANYVERB_ANYFORM );

# "..., not noticing that..."
# TODO: this does overgenerate!
# The negated context scans left for a preposition and must stop at the
# clause delimiter: BARRIER and the set name _COMMA are two separate tokens
# (they were previously fused into "BARRIER_COMMA").
SELECT (PART) (-1* _COMMA BARRIER _ANYVERB_ANYFORM )
              (NOT -1* (IN) BARRIER _COMMA )
              ((+1* _COMMA_OR_SUBORD BARRIER _ANYVERB_ANYFORM ) OR (+1* (<<<) BARRIER _ANYVERB_ANYFORM)) ;

#----- GERUND READINGS ------------------------------------------------------

### treat lexical sure-fire verbs
# without subject: the preceding cohort is a gerund-candidate verb
SELECT (GERU) (-1 _LEX_GERUND_CANDIDATES LINK 0 _ANYVERB_ANYFORM );
# and with a subject: one noun or pronoun between trigger verb and -ing form
# TODO: does this overgenerate?
SELECT (GERU) (-1 _NOUNS LINK -1 _LEX_GERUND_CANDIDATES LINK 0 _ANYVERB_ANYFORM );
SELECT (GERU) (-1 (PPO) LINK -1 _LEX_GERUND_CANDIDATES LINK 0 _ANYVERB_ANYFORM );
SELECT (GERU) (-1 (PP$) LINK -1 _LEX_GERUND_CANDIDATES LINK 0 _ANYVERB_ANYFORM );

### lexical sure-fire fixed expressions
# The fixed expression may start one to four cohorts to the left.
#SELECT (GERU) (T:_FIXED_EXPRESSIONS);
SELECT (GERU) (-1< ("") LINK (T:_FIXED_EXPRESSIONS) LINK +1 (GERU));
SELECT (GERU) (-2< ("") LINK (T:_FIXED_EXPRESSIONS) LINK +1 (GERU));
SELECT (GERU) (-3< ("") LINK (T:_FIXED_EXPRESSIONS) LINK +1 (GERU));
SELECT (GERU) (-4< ("") LINK (T:_FIXED_EXPRESSIONS) LINK +1 (GERU));

### treat the rest

# simple rules that should be not too dangerous, obtained from simple corpus
# observations
SELECT (GERU) (-2 _ADVER) (-1 (IN));
SELECT (GERU) (-2 _ADVER) (-1 (CS));
SELECT (GERU) (-2 _ADJEC) (-1 (IN));
SELECT (GERU) (-2 _ADJEC) (-1 (CS));

# PP object of a noun
# "... plan for getting out ..."
# "...
# of his (not) really having stolen ..."
# (tail of the example sentence started on the previous line; prefixed with
# '#' so that the line is a comment by itself)
# A noun to the left (only adverbs, negation and QL may intervene) which is
# itself directly preceded by a preposition.
SELECT (GERU) (-1* _NOUNS BARRIER (*) - _ADVER - _NEG - (QL) LINK -1 (IN));

# "... your (terrible) racing ..."
SELECT (GERU) (-1* (PP$) BARRIER (*) - _ADJEC - (QL) );

# very simple sledgehammer rules that catch most cases.
# TODO: this overgenerates too much
#SELECT (GERU) (-1 (IN));
#SELECT (GERU) (-1 (CS));

# gerund (with own arguments) as subject argument to the verb in 3rd person
# and/or past tense or will-future (with optional adverbs)
# "... playing playfully is awesome ... "
# "... playing the guitar was what he did ... "
# Last member is HVZ (previously misspelled "VHZ"), the tag that
# _ANYVERB_ANYFORM uses for the 3rd person present of "have".
LIST _3P_PRES = VBZ BEZ DOZ HVZ;
LIST _3P_PAST = BEDZ DOD HVD VBD;
SELECT (GERU) ( +1* _3P_PRES BARRIER _COMMA_OR_SUBORD OR _ANYVERB_ANYFORM);
SELECT (GERU) ( +1* _3P_PAST BARRIER _COMMA_OR_SUBORD OR _ANYVERB_ANYFORM);
SELECT (GERU) ( +1* (MD) BARRIER _COMMA_OR_SUBORD OR _ANYVERB_ANYFORM LINK +1 _UNINF);

# gerund as the object of a preposition
# "He insisted on telling us the truth."
# "He prevented the sun from shining."
SELECT (GERU) (-1 (IN)) (-2* _ANYVERB_ANYFORM BARRIER _COMMA);

# 'predicative gerund' such as "Being very good (is incredibly bad.)"
SELECT (GERU) (0 (BEG)) (+1* _ADVER BARRIER (*) - (QL) );

#############################################################################
#
# After inferring the required information: marking of the chunks
#
#############################################################################

AFTER-SECTIONS

# ATTENTION: since we do not want ambiguous annotations to be marked,
# every constraint here must be careful (marked with C). This will
# omit ambiguous readings.

# marking infinitive constructions: TO ADVER* INFIN
# INF-B: beginning of chunk. INF-I: inside of chunk.
# Infinitive chunk: TO opens it (INF-B); the infinitive itself and any
# adverbs between TO and the infinitive are inside it (INF-I).
# The scanning positions are spelled +1*C / -1*C and the negation "NOT 0",
# i.e. the same way as in all other rules of this grammar (they were written
# "+*1C", "-*1C" and "NOT0" here).
ADD (INF-B) (TO) (+1*C _INFIN BARRIER (*) - _ADVER );
ADD (INF-I) _INFIN (-1*C (TO) BARRIER (*) - _ADVER );
ADD (INF-I) _ADVER (-1*C (TO) BARRIER (*) - _ADVER ) (+1*C _INFIN BARRIER (*) - _ADVER );
# Adverbs inside an infinitive chunk: the one directly after TO gets
# INFSPLIT-B, every other one INFSPLIT-I.
ADD (INFSPLIT-B) (INF-I) (0 _ADVER) (-1 (TO));
ADD (INFSPLIT-I) (INF-I) (0 _ADVER) (NOT 0 (INFSPLIT-B));

# marking gerund constructions (only when GERU is the sole remaining reading)
ADD (GER-B) (GERU) (0C (GERU));

# these are used for the debug option of showing all known ingforms
ADD (GOI-B) (GOFU) (0C (GOFU));
ADD (GOP-B) (GOFUPA) (0C (GOFUPA));
ADD (PRO-B) (PROG) (0C (PROG));
ADD (PAR-B) (PART) (0C (PART));

# and this marks the ones which are still ambiguous, ...
# (the NOT 0 (AMB-B) contexts keep a cohort from receiving more than one
# AMB-B reading)
APPEND ("" AMB-B) (GERU) (NOT 0 (GER-B));
APPEND ("" AMB-B) (PROG) (NOT 0 (PRO-B)) (NOT 0 (AMB-B));
APPEND ("" AMB-B) (PART) (NOT 0 (PAR-B)) (NOT 0 (AMB-B));
APPEND ("" AMB-B) (GOFU) (NOT 0 (GOI-B)) (NOT 0 (AMB-B));
APPEND ("" AMB-B) (GOFUPA) (NOT 0 (GOP-B)) (NOT 0 (AMB-B));
# ... and then delete the rest (better comment out this during grammar development!)
SELECT (AMB-B) (0 (AMB-B));

#############################################################################
#
# Finding clue phrases and annotating them as chunks.
#
#############################################################################

AFTER-SECTIONS

# these are the begin-tags of all types of clues there are
LIST _ALLCUETYPES_B = CLU-GERONLY-B CLU-INFONLY-B CLU-BOTHMEANDIFF-B CLU-BOTHMEANSAME-B CLU-FIXEDEXP-B CLU-INFONLY-COLORIZE-B;
# ... and the corresponding inside-tags
LIST _ALLCUETYPES_I = CLU-GERONLY-I CLU-INFONLY-I CLU-BOTHMEANDIFF-I CLU-BOTHMEANSAME-I CLU-FIXEDEXP-I CLU-INFONLY-COLORIZE-I;
SET _ALLCLUETYPES = _ALLCUETYPES_B OR _ALLCUETYPES_I;
LIST _INFCHUNKS = INF-I INF-B;

# fixed expression clues
# TODO: have something between clue and gerund?
# Fixed-expression clues: the cohort where the template matches and which is
# followed by a gerund (GER-B) resp. infinitive (INF-B) chunk start gets
# CLU-FIXEDEXP-B; cohorts between that start and the chunk get CLU-FIXEDEXP-I.
ADD (CLU-FIXEDEXP-B) ("") ( T:_FIXED_EXPRESSIONS LINK +1 (GER-B) );
ADD (CLU-FIXEDEXP-I) ("") (-1* (CLU-FIXEDEXP-B)) (+1* (GER-B));
ADD (CLU-FIXEDEXP-B) ("") (T:_FIXED_EXPR_TOINF LINK +1 (INF-B) );
ADD (CLU-FIXEDEXP-I) ("") (-1* (CLU-FIXEDEXP-B)) (+1* (INF-B));

# ATTENTION: it is necessary to ensure leaving alone things that are fixed expression
# clues in the rest of this grammar section.

# marking up lexical clues
# - be careful not to overwrite clues and not to place clues inside infinitives
SET GERCLUBLOCK = _ALLCLUETYPES OR _INFCHUNKS;

# those with gerunds...
# (the trigger must be a verb, must not already carry a clue or infinitive
# chunk tag, and a GER-B must follow before any other verb or clause
# delimiter)
ADD (CLU-GERONLY-B) _LEX_ALWAYS_GERUND (+1* (GER-B) BARRIER _ANYVERB_ANYFORM OR _COMMA ) (0 _ANYVERB_ANYFORM ) (NOT 0 GERCLUBLOCK) ;
ADD (CLU-BOTHMEANDIFF-B) _LEX_GERUND_OR_TOINF_MEANING_CHANGE (+1* (GER-B) BARRIER _ANYVERB_ANYFORM OR _COMMA ) (0 _ANYVERB_ANYFORM ) (NOT 0 GERCLUBLOCK);
ADD (CLU-BOTHMEANSAME-B) _LEX_GERUND_OR_TOINF_NO_MEANING_CHANGE (+1* (GER-B) BARRIER _ANYVERB_ANYFORM OR _COMMA ) (0 _ANYVERB_ANYFORM ) (NOT 0 GERCLUBLOCK);

# ... and those with infinitives
ADD (CLU-INFONLY-B) _LEX_ALWAYS_TOINF (+1* (INF-B) BARRIER _ANYVERB_ANYFORM OR _COMMA ) (0 _ANYVERB_ANYFORM ) (NOT 0 GERCLUBLOCK);
ADD (CLU-BOTHMEANDIFF-B) _LEX_GERUND_OR_TOINF_MEANING_CHANGE (+1* (INF-B) BARRIER _ANYVERB_ANYFORM OR _COMMA ) (0 _ANYVERB_ANYFORM ) (NOT 0 GERCLUBLOCK);
ADD (CLU-BOTHMEANSAME-B) _LEX_GERUND_OR_TOINF_NO_MEANING_CHANGE (+1* (INF-B) BARRIER _ANYVERB_ANYFORM OR _COMMA ) (0 _ANYVERB_ANYFORM ) (NOT 0 GERCLUBLOCK);

# Some noun triggers (the infinitive chunk must follow the noun directly)
ADD (CLU-INFONLY-B) _NOUN_ALWAYS_TOINF (0 _NOUNS) (+1 (INF-B)) (NOT 0 GERCLUBLOCK);
ADD (CLU-INFONLY-COLORIZE-B) _NOUN_TOINF_COLORIZE (0 _NOUNS) (+1 (INF-B)) (NOT 0 GERCLUBLOCK);

# marking up all prepositions directly preceding gerunds
# TODO: is this what the learner wants to see?
# A preposition (IN) directly followed by an unambiguous gerund chunk start
# becomes a gerund-only clue, unless it already carries a clue tag.
ADD (CLU-GERONLY-B) (IN) (+1C (GER-B)) (NOT 0 _ALLCLUETYPES);

#############################################################################
#
# Marking relevant text spans that include a clue phrase + gerund or inf
# as RELEVANT chunks.
#
#############################################################################

AFTER-SECTIONS

### add marker for relevant text spans including clue phrases and gerunds
# Span start: a clue begin-tag with a gerund ahead and no infinitive chunk
# in between.
ADD (RELEVANT-B) ("") (0C _ALLCUETYPES_B) (+1*C (GER-B) BARRIER _INFCHUNKS );
# Cohorts between the span start and the gerund.
ADD (RELEVANT-I) ("") (-1*C (RELEVANT-B) BARRIER (GER-B) OR (INF-B)) (+1*C (GER-B) BARRIER _INFCHUNKS );
# The gerund itself, when it directly follows a cohort already in the span.
ADD (RELEVANT-I) ("") (-1 (RELEVANT-I)) (0C (GER-B));
ADD (RELEVANT-I) ("") (-1 (RELEVANT-B)) (0C (GER-B));

### add marker for relevant text spans including clue phrases and infinitives
# Span start: a clue begin-tag with an infinitive chunk start ahead and no
# gerund in between.
ADD (RELEVANT-B) ("") (0C _ALLCUETYPES_B) (+1*C (INF-B) BARRIER (GER-B) );
# Cohorts between the span start and the infinitive chunk.
ADD (RELEVANT-I) ("") (-1*C (RELEVANT-B) BARRIER (GER-B) OR (INF-B) ) (+1*C (INF-B) BARRIER (GER-B) );
# The infinitive chunk itself (its INF-B and INF-I cohorts), when it
# directly follows a cohort already in the span.
ADD (RELEVANT-I) ("") (-1 (RELEVANT-I)) (0C (INF-B));
ADD (RELEVANT-I) ("") (-1 (RELEVANT-B)) (0C (INF-B));
ADD (RELEVANT-I) ("") (-1 (RELEVANT-I)) (0C (INF-I));
ADD (RELEVANT-I) ("") (-1 (RELEVANT-B)) (0C (INF-I));