# This is a makefile that builds the Northern Sami morphological parser
# *****************************************************************

# =============================== #
#      Variable definitions       #
# =============================== #

# Tools used when compiling the transducers
XFST  = xfst -utf8
TWOLC = twolc -utf8
LEXC  = lexc -utf8

# The lexc source files, in the order they are handed to lexc.
SOURCEFILES = sme-lex.txt adj-sme-lex.txt \
              adv-sme-lex.txt noun-sme-lex.txt verb-sme-lex.txt \
              closed-sme-lex.txt pp-sme-lex.txt acro-sme-lex.txt \
              abbr-sme-lex.txt propernoun-sme-lex.txt punct-sme-lex.txt

# One ../int/<name>.nonrec per source file.
NONRECFILES := $(patsubst %.txt,../int/%.nonrec,$(SOURCEFILES))

# The short names below are convenience aliases, never files in this
# directory; declaring them phony keeps a stray file from masking them.
.PHONY: all isme.fst pos-sme.fst tag-pos.fst

# Added for historical purposes
all:
	@echo "Don't use this Makefile, cd $$GTHOME/gt/ ; make hfst GTLANG=sme"

# =============================== #
# Building the last file isme.fst #
# =============================== #

# The ultimate goal is to build isme.fst, the generator
# This goal depends on sme.save being up to date.

isme.fst: ../bin/isme.fst
../bin/isme.fst: ../bin/pos-sme.fst ../bin/n-sme.fst ../bin/s-sme.fst \
		../bin/sme.save ../int/allcaps.fst \
		../int/caseconv.fst ../bin/d-sme.fst ../bin/guess-sme \
		../bin/abbr.txt ../bin/cap-sme ../bin/foreign.fst ../bin/typos.fst \
		../int/typoslist.txt ../int/tag-not-save.fst ../bin/missing \
		../int/nonrec-sme.save
	@echo
	@echo "*** Building the inverse isme.fst ***"
	@echo
	@printf "read regex [[@\"../int/tag-not-save.fst\"] .o. \
	[@\"../bin/sme.save\"]] ; \n\
	invert net \n\
	save stack ../bin/isme.fst \n\
	quit \n" > ../../tmp/isme-fst-script
	$(XFST) < ../../tmp/isme-fst-script
	@rm -f ../../tmp/isme-fst-script

# Earlier variants of the xfst script above, kept for reference only:
#
#   Inline removal of the +TV/+IV tags:
#       read regex [[0 <- %+TV, 0 <- %+IV ] .o. @"../bin/sme.save"] ;
#       invert net
#       save stack ../bin/isme.fst
#       quit
#
#   Original:
#       load ../bin/sme.save
#       invert net
#       save stack ../bin/isme.fst
#       quit

# ======================================================= #
# Building different versions of the basic sme.fst tagger #
# ======================================================= #

# We want an analyzer with POS tags only. It takes the linguistic
# sme.fst as input and gives us an alternate pos-sme.fst.

pos-sme.fst: ../bin/pos-sme.fst
../bin/pos-sme.fst: ../bin/sme.fst ../int/tag-pos.fst
	@echo
	@echo "*** Building pos-sme.fst, sme.fst with POS tags ***"
	@echo
	@printf "read regex [[@\"../int/tag-pos.fst\"] .o. \
	[@\"../bin/sme.fst\"]] ; \n\
	save stack ../bin/pos-sme.fst \n\
	quit \n" > ../../tmp/pos-sme-fst-script
	$(XFST) < ../../tmp/pos-sme-fst-script
	@rm -f ../../tmp/pos-sme-fst-script

# In order to make pos-sme.fst we need a binary tag-pos.fst
# This goal depends on tag-pos.regex. The way it
# is done is that all tags except the POS one are deleted.

tag-pos.fst: ../int/tag-pos.fst
../int/tag-pos.fst: tag-pos.regex
	@echo
	@echo "*** Building tag-pos.fst ***"
	@echo
	@printf "read regex < tag-pos.regex \n\
	save stack ../int/tag-pos.fst \n\
	quit \n" > ../../tmp/sme-tag-pos-script
	$(XFST) < ../../tmp/sme-tag-pos-script
	@rm -f ../../tmp/sme-tag-pos-script

# We want to delete the +TV +IV tags for the generator (and other
# tags later on?). For that we need our tag-deleter.

# The short names below are convenience aliases, never files in this
# directory; declaring them phony keeps a stray file from masking them.
.PHONY: tag-not-save.fst n-sme.fst tag-no.fst s-sme.fst tag-sme.fst \
        g-sme.fst guess-sme d-sme.fst sme.fst caseconv.fst allcaps.fst \
        sme.save nonrec-sme.save

# The tag deleter composed on top of sme.save when building isme.fst.
# This goal depends on tag-not-save.regex

tag-not-save.fst: ../int/tag-not-save.fst
../int/tag-not-save.fst: tag-not-save.regex
	@echo
	@echo "*** Building tag-not-save.fst ***"
	@echo
	@printf "read regex < tag-not-save.regex \n\
	save stack ../int/tag-not-save.fst \n\
	quit \n" > ../../tmp/sme-tag-not-save-script
	$(XFST) < ../../tmp/sme-tag-not-save-script
	@rm -f ../../tmp/sme-tag-not-save-script

# We want an analyzer with Norwegian tags. It takes the linguistic
# sme.fst as input and gives us an alternate n-sme.fst

n-sme.fst: ../bin/n-sme.fst
../bin/n-sme.fst: ../bin/sme.fst ../int/tag-no.fst
	@echo
	@echo "*** Building n-sme.fst, sme.fst with Norwegian tags ***"
	@echo
	@printf "read regex [[@\"../int/tag-no.fst\"] .o. \
	[@\"../bin/sme.fst\"]] ; \n\
	save stack ../bin/n-sme.fst \n\
	quit \n" > ../../tmp/n-sme-fst-script
	$(XFST) < ../../tmp/n-sme-fst-script
	@rm -f ../../tmp/n-sme-fst-script

# In order to make n-sme.fst we need a binary tag-no.fst
# This goal depends on tag-no.regex

tag-no.fst: ../int/tag-no.fst
../int/tag-no.fst: tag-no.regex
	@echo
	@echo "*** Building tag-no.fst ***"
	@echo
	@printf "read regex < tag-no.regex \n\
	save stack ../int/tag-no.fst \n\
	quit \n" > ../../tmp/sme-tag-no-script
	$(XFST) < ../../tmp/sme-tag-no-script
	@rm -f ../../tmp/sme-tag-no-script

# We also want an analyzer with Sami tags. It takes the linguistic
# sme.fst as input and gives us an alternate s-sme.fst
# The alias must name ../bin/s-sme.fst, the file the rule below really
# builds; there is no rule for a copy under ../int/.

s-sme.fst: ../bin/s-sme.fst
../bin/s-sme.fst: ../bin/sme.fst ../int/tag-sme.fst
	@echo
	@echo "*** Building s-sme.fst, sme.fst with Sami tags ***"
	@echo
	@printf "read regex [[@\"../int/tag-sme.fst\"] .o. \
	[@\"../bin/sme.fst\"]] ; \n\
	save stack ../bin/s-sme.fst \n\
	quit \n" > ../../tmp/s-sme-fst-script
	$(XFST) < ../../tmp/s-sme-fst-script
	@rm -f ../../tmp/s-sme-fst-script

# In order to make s-sme.fst we need a binary tag-sme.fst
# This goal depends on tag-sme.regex

tag-sme.fst: ../int/tag-sme.fst
../int/tag-sme.fst: tag-sme.regex
	@echo
	@echo "*** Building tag-sme.fst ***"
	@echo
	@printf "read regex < tag-sme.regex \n\
	save stack ../int/tag-sme.fst \n\
	quit \n" > ../../tmp/tag-sme-script
	$(XFST) < ../../tmp/tag-sme-script
	@rm -f ../../tmp/tag-sme-script

# g-sme.fst is built in four steps:
#   1. lexc compiles the lexicon sources and saves them as ../int/g-sme.save
#   2. xfst runs guess-script.xfst and saves the stack as ../int/g-sme.fsm
#   3. lexc composes ../int/g-sme.fsm with the twol rules into ../int/g-sme.fst
#   4. xfst composes that result with caseconv.fst and overwrites ../int/g-sme.fst
# twol-sme.bin and caseconv.fst are read directly by steps 3 and 4, so
# they are listed here as well as being reachable through ../bin/sme.fst.

g-sme.fst: ../int/g-sme.fst
../int/g-sme.fst: ../bin/sme.fst guess-script.xfst \
		../int/twol-sme.bin ../int/caseconv.fst
	@echo
	@echo "*** Building g-sme.fst ***"
	@echo
	@printf "compile-source $(SOURCEFILES) \n\
	save-source ../int/g-sme.save \n\
	quit \n" > ../../tmp/g-sme-save-script
	$(LEXC) < ../../tmp/g-sme-save-script
	@rm -f ../../tmp/g-sme-save-script
	@printf "source guess-script.xfst \n\
	save stack ../int/g-sme.fsm \n\
	quit \n" > ../../tmp/guess-sme-script
	$(XFST) < ../../tmp/guess-sme-script
	@rm -f ../../tmp/guess-sme-script
	@printf "read-source ../int/g-sme.fsm \n\
	read-rules ../int/twol-sme.bin \n\
	compose-result \n\
	save-result ../int/g-sme.fst \n\
	quit \n" > ../../tmp/g-sme-save-script
	$(LEXC) < ../../tmp/g-sme-save-script
	@rm -f ../../tmp/g-sme-save-script
	@printf "read regex @\"../int/g-sme.fst\" .o. \
	@\"../int/caseconv.fst\" ; \n\
	save stack ../int/g-sme.fst \n\
	quit \n " > ../../tmp/g-sme-fst-script
	$(XFST) < ../../tmp/g-sme-fst-script
	@rm -f ../../tmp/g-sme-fst-script

# Absolute path of the sibling bin/ directory, written into the generated
# guess-sme, missing and cap-sme files. Computed from the current
# directory by make itself, so a "src" component elsewhere in the path
# cannot be rewritten by mistake.
BINDIR := $(abspath ../bin)

# guess-sme is a small text file that names sme.fst and g-sme.fst by
# absolute path and then lists the labels "sme" and "guesser".

guess-sme: ../bin/guess-sme
../bin/guess-sme:
	@echo
	@echo "*** Generating guess-sme ***"
	@echo
	@printf "sme ${BINDIR}/sme.fst\n\
	guesser ${INTDIR}/g-sme.fst\n\n\
	sme \n\
	guesser \n" > $@

# Here we build d-sme.fst, the morphological tagger geared towards
# disambiguation.

d-sme.fst: ../bin/d-sme.fst
../bin/d-sme.fst: ../bin/sme.fst ../int/dis-tag.fst
	@echo
	@echo "*** Building d-sme.fst, sme.fst w/ tags for disambiguation ***"
	@echo
	@printf "read regex [[@\"../int/dis-tag.fst\"] .o. \
	[@\"../bin/sme.fst\"]] ; \n\
	save stack ../bin/d-sme.fst \n\
	quit \n" > ../../tmp/d-sme-fst-script
	$(XFST) < ../../tmp/d-sme-fst-script
	@rm -f ../../tmp/d-sme-fst-script

# ================================= #
# Building the basic tagger sme.fst #
# ================================= #

# Here we build sme.fst, the core morphological tagger
# It takes sme.save as input and adds the caseconv fst in order to
# handle initial capitals.

sme.fst: ../bin/sme.fst
../bin/sme.fst: ../bin/sme.save ../int/caseconv.fst \
		../int/allcaps.fst ../bin/abbr.txt ../bin/cap-sme
	@echo
	@echo "*** Building sme.fst ***"
	@echo
	@printf "read regex @\"../bin/sme.save\" .o. \
	@\"../int/caseconv.fst\" ; \n\
	save stack ../bin/sme.fst \n\
	quit \n" > ../../tmp/sme-fst-script
	$(XFST) < ../../tmp/sme-fst-script
	@rm -f ../../tmp/sme-fst-script

# ================================================= #
# Building auxiliary files for case conversion etc. #
# ================================================= #

# The second goal is to build the caseconv.fst file
# This goal depends on case.regex

caseconv.fst: ../int/caseconv.fst
../int/caseconv.fst: case.regex
	@echo
	@echo "*** Building caseconv.fst ***"
	@echo
	@printf "read regex < case.regex \n\
	save stack ../int/caseconv.fst \n\
	quit \n" > ../../tmp/caseconv-sme-script
	$(XFST) < ../../tmp/caseconv-sme-script
	@rm -f ../../tmp/caseconv-sme-script

# This goal depends on allcaps.regex

allcaps.fst: ../int/allcaps.fst
../int/allcaps.fst: allcaps.regex ../bin/cap-sme
	@echo
	@echo "*** Building allcaps.fst ***"
	@echo
	@printf "source allcaps.regex \n\
	save stack ../int/allcaps.fst \n\
	quit \n" > ../../tmp/allcaps-sme-script
	$(XFST) < ../../tmp/allcaps-sme-script
	@rm -f ../../tmp/allcaps-sme-script

# ========================================================== #
# Building the basic parser from morphophonology and lexicon #
# ========================================================== #

# The third goal is to build sme.save
# This goal depends on twol-sme.bin and a bunch of lexicon files
# (the list kept in SOURCEFILES).

sme.save: ../bin/sme.save
../bin/sme.save: ../int/twol-sme.bin $(SOURCEFILES)
	@echo
	@echo "*** Building sme.save ***"
	@echo
	printf "compile-source $(SOURCEFILES) \n\
	read-rules ../int/twol-sme.bin \n\
	compose-result \n\
	save-result ../bin/sme.save \n\
	quit \n" > ../../tmp/sme-save-script
	$(LEXC) < ../../tmp/sme-save-script
	@rm -f ../../tmp/sme-save-script

# We need a variant of this third goal, to build a non-recursive sme.save
# This goal depends on twol-sme.bin and a bunch of lexicon files

nonrec-sme.save: ../int/nonrec-sme.save
# Compile the .nonrec copies of the lexicon files with lexc and compose
# them with the twol rules, giving ../int/nonrec-sme.save.
../int/nonrec-sme.save: ../int/twol-sme.bin $(NONRECFILES)
	@echo
	@echo "*** Building nonrec-sme.save ***"
	@echo
	printf "compile-source $(NONRECFILES) \n\
	read-rules ../int/twol-sme.bin \n\
	compose-result \n\
	save-result ../int/nonrec-sme.save \n\
	quit \n" > ../../tmp/nonrec-sme-save-script
	$(LEXC) < ../../tmp/nonrec-sme-save-script
	@rm -f ../../tmp/nonrec-sme-save-script

# Each ../int/<name>.nonrec is its own <name>.txt with every line
# containing ^C^ dropped. The static pattern rule ties each output to
# that one source file, so editing a single lexicon file no longer
# regenerates all of them.
$(NONRECFILES): ../int/%.nonrec: %.txt
	@echo
	@echo "*** Removing circular entries ***"
	@echo
	@grep -v '\^C\^' $< > $@

# The short names below are convenience aliases, never files in this
# directory; declaring them phony keeps a stray file from masking them.
.PHONY: twol-sme.bin dis-tag.fst

# The fourth goal is to build twol-sme.bin
# This goal depends on twol-sme.txt

twol-sme.bin: ../int/twol-sme.bin
../int/twol-sme.bin: twol-sme.txt
	@echo
	@echo "*** Building twol-sme.bin ***"
	@echo
	@printf "read-grammar twol-sme.txt \n\
	compile \n\
	save-binary ../int/twol-sme.bin \n\
	quit \n" > ../../tmp/twol-sme-script
	$(TWOLC) < ../../tmp/twol-sme-script
	@rm -f ../../tmp/twol-sme-script

# =========================== #
# Building preprocessor files #
# =========================== #

# We want to make a parser with tags for parsing.
# In order to get that we make a tag modifier

dis-tag.fst: ../int/dis-tag.fst
../int/dis-tag.fst: dis-tag.txt
	@echo
	@echo "*** Building the tag manipulator dis-tag.fst ***"
	@echo
	@printf "source dis-tag.txt \n\
	save stack ../int/dis-tag.fst \n\
	quit \n" > ../../tmp/dis-tag-sme-script
	$(XFST) < ../../tmp/dis-tag-sme-script
	@rm -f ../../tmp/dis-tag-sme-script

# Here we include a preprocessor. This goal depends on tok.txt
# Note! This preprocessor is obsolete, and replaced with preprocess.
# Do not use unless we decide to go back to tokenize!

# The short names below are convenience aliases, never files in this
# directory; declaring them phony keeps a stray file from masking them.
.PHONY: tok.fst abbr.txt

# The tokenizer transducer, compiled by sourcing tok.txt in xfst.
tok.fst: ../int/tok.fst
../int/tok.fst: tok.txt
	@echo
	@echo "*** Building the tokenizer tok.fst ***"
	@echo
	@printf "source tok.txt \n\
	save stack ../int/tok.fst \n\
	quit \n" > ../../tmp/tok-sme-script
	$(XFST) < ../../tmp/tok-sme-script
	@rm -f ../../tmp/tok-sme-script

# Here we make the abbreviation file for our current preprocessor,
# the perl-based preprocess (located in the script catalogue)
# In the recipe, $< is the extraction script (the first prerequisite)
# and $@ is ../bin/abbr.txt.

abbr.txt: ../bin/abbr.txt
../bin/abbr.txt: ../../script/abbr-extract.pl abbr-sme-lex.txt \
		propernoun-sme-lex.txt closed-sme-lex.txt adv-sme-lex.txt \
		noun-sme-lex.txt sme-num.txt
	@echo
	@echo "*** Extracting abbreviations from abbr-sme-lex.txt to abbr.txt ***"
	@echo
	@perl $< \
		--abbr_lex=abbr-sme-lex.txt \
		--lex=propernoun-sme-lex.txt,closed-sme-lex.txt,adv-sme-lex.txt,noun-sme-lex.txt,sme-num.txt \
		--output=$@

# Absolute path of the sibling int/ directory, written into the generated
# guess-sme and cap-sme files. Computed from the current directory by
# make itself, so a "src" component elsewhere in the path cannot be
# rewritten by mistake.
INTDIR := $(abspath ../int)

# Here we build a transducer that gives us only the Sámi wordforms missing from
# our transducers. Non-Sámi words from Norwegian, Finnish, English, etc. are filtered
# out by this script, as are registered typos.

# The short names below are convenience aliases or commands, never files
# in this directory; declaring them phony keeps a stray file (for
# instance one called "clean") from masking them.
.PHONY: missing cap-sme foreign.fst new-foreign.fst old-foreign.fst \
        typos.fst typoslist.txt clean rec-clean

# "missing" is a small text file that names the analyzer, foreign and
# typos transducers by absolute path and then lists those three labels.

missing: ../bin/missing
../bin/missing:
	@echo
	@echo "*** Generating missing ***"
	@echo
	@printf "analyzer ${BINDIR}/sme.fst\n\
	foreign ${BINDIR}/foreign.fst\n\
	typos ${BINDIR}/typos.fst\n\n\
	analyzer\n\
	foreign\n\
	typos\n" > $@

# "cap-sme" is the same kind of file for sme.fst and allcaps.fst.

cap-sme: ../bin/cap-sme
../bin/cap-sme:
	@echo
	@echo "*** Generating cap-sme ***"
	@echo
	@printf "analyzer ${BINDIR}/sme.fst\n\
	allcaps ${INTDIR}/allcaps.fst\n\n\
	allcaps analyzer \n" > $@

# foreign.fst is the union of the old and the new foreign-word transducers.

foreign.fst: ../bin/foreign.fst
../bin/foreign.fst: ../int/old-foreign.fst ../int/new-foreign.fst
	@echo
	@echo "*** Building a transducer for foreign words ***"
	@echo
	@printf "load stack ../int/old-foreign.fst \n\
	load stack ../int/new-foreign.fst \n\
	union net \n\
	save stack ../bin/foreign.fst \n\
	quit \n" > ../../tmp/foreign-sme-script
	$(XFST) < ../../tmp/foreign-sme-script
	@rm -f ../../tmp/foreign-sme-script

# The alias is named after the file it builds. "make foreign.fst" still
# builds this file, because ../bin/foreign.fst lists it as a prerequisite.

new-foreign.fst: ../int/new-foreign.fst
../int/new-foreign.fst: ../../script/new-foreign.txt
	@echo
	@echo "*** Our transducer for new foreign words ***"
	@echo
	@printf "read text < ../../script/new-foreign.txt \n\
	save stack ../int/new-foreign.fst \n\
	quit \n" > ../../tmp/new-foreign-sme-script
	$(XFST) < ../../tmp/new-foreign-sme-script
	@rm -f ../../tmp/new-foreign-sme-script

old-foreign.fst: ../int/old-foreign.fst
../int/old-foreign.fst: ../../script/old-foreign.txt
	@echo
	@echo "*** Our ready-built transducer for foreign words ***"
	@echo
	@printf "read text < ../../script/old-foreign.txt \n\
	save stack ../int/old-foreign.fst \n\
	quit \n" > ../../tmp/old-foreign-sme-script
	$(XFST) < ../../tmp/old-foreign-sme-script
	@rm -f ../../tmp/old-foreign-sme-script

# typos.fst is compiled from the word list in ../int/typoslist.txt.

typos.fst: ../bin/typos.fst
../bin/typos.fst: ../int/typoslist.txt
	@echo
	@echo "*** Our transducer for typographical errors ***"
	@echo
	@printf "read text < ../int/typoslist.txt \n\
	save stack ../bin/typos.fst \n\
	quit \n" > ../../tmp/typos-sme-script
	$(XFST) < ../../tmp/typos-sme-script
	@rm -f ../../tmp/typos-sme-script

# typoslist.txt is the first tab-separated column of typos.txt.

typoslist.txt: ../int/typoslist.txt
../int/typoslist.txt: typos.txt
	@echo
	@echo "*** Our list of common typographical errors ***"
	@echo
	@cut -f1 typos.txt > ../int/typoslist.txt

# ========== #
# make clean #
# ========== #

# "make clean" is there to remove the binary files, as well as the
# temporary nonrec files, at will.
# The nonrec files are created under ../int/, so they are removed there.
# The *.nonrec pattern for this directory and the ../int/*.int pattern
# are kept from the earlier version of this rule. ../int/*.bin covers
# the compiled twol rules.

clean:
	@rm -f ../bin/*.fst ../bin/*.save ../bin/*.bin ../bin/abbr.txt
	@rm -f ../int/*.fst ../int/*.save ../int/*.int ../int/*.bin
	@rm -f *.nonrec ../int/*.nonrec $(NONRECFILES)

# Another clean target to remove only the temp files for making the
# non-circular transducer:

rec-clean:
	@rm -f $(NONRECFILES)