# ******************************************************************** #
# This is a common makefile that builds the Sami morphological parsers #
# using the HFST tools.                                                #
# ******************************************************************** #

# Usage:
# make GTLANG=sme hfst
#
# Variables this file reads but does not define (they must come from the
# command line, the environment or an including makefile):
#   GTLANG      - language code; used as the per-language directory name
#                 and as part of most target file names.
#   GTSHORTLANG - language code used in the ~/.voikko install path.
#   SRCS        - the lexc source files handed to hfst-lexc.

# Remove a target whose recipe failed, so that a half-written transducer is
# never mistaken for an up-to-date one on the next run.
.DELETE_ON_ERROR:

HFST_FLAGS=-v
HLEXC_FLAGS=$(HFST_FLAGS)
HTWOLC_FLAGS=$(HFST_FLAGS) --resolve

ZIP = /usr/bin/zip

# Possible formats are:
# - foma
# - ofst-log
# - ofst-tropical
# - openfst
# - openfst-log
# - openfst-tropical
# - optimized-lookup
# - optimized-lookup-unweighted
# - optimized-lookup-weighted
# - sfst
# This is the default format, it might be overridden for specific transducers:
HFST_FORMAT=openfst-tropical

# Targets that are commands or aliases, not files. Declaring them phony keeps
# a stray file of the same name from silently disabling the target.
.PHONY: hfst twol-hfst lexc-hfst save-hfst ihfst i$(GTLANG).hfst \
        upper-lexc-hfst $(GTLANG).hfst hfstnorm hfsttest hfstspellers \
        zhfst voikkohfst voikkoinstall ocr

# Main target. Produces analysers, generators and spellers.
hfst: $(GTLANG).hfst hfstspellers

# Target for all common regex files:
# A two-line hfst-xfst script is generated next to the target, run, and
# removed again.
common/bin/%.hfst: common/src/%.regex
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	printf "read regex @re\"$<\" ; \n\
	save stack $@ \n" > $@.script
	hfst-xfst -F $(HFST_FORMAT) -f $@.script
	rm -f $@.script

# Target for all common xfst script files:
common/bin/%.hfst: common/src/%.xfst
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	printf "source $< \n\
	save stack $@ \n" > $@.script
	hfst-xfst -F $(HFST_FORMAT) -f $@.script
	rm -f $@.script

# Target for all COMMON twolc files:
common/bin/%.hfst: common/src/%.twolc
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-twolc $(HTWOLC_FLAGS) -i $< -o $@

# Target for language-specific twolc files:
$(GTLANG)/bin/%.hfst: $(GTLANG)/hfst/%.twolc
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-twolc $(HTWOLC_FLAGS) -i $< -o $@

# Compile the twolc file - it still needs M4 preprocessing
# `twol-hfst` is only an alias: it must not share the recipe below, or
# `make twol-hfst` would write a file literally named "twol-hfst".
twol-hfst: $(GTLANG)/bin/$(GTLANG)-twol.hfst
$(GTLANG)/bin/$(GTLANG)-twol.hfst: $(GTLANG)/src/twol-$(GTLANG).txt
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-twolc $(HTWOLC_FLAGS) -i $< -o $@

# Compile lexicon files, and add some initial weights to it:
# The lexicon is first compiled to a temporary foma-format file, then
# converted to $(HFST_FORMAT) and passed through a chain of hfst-reweight
# calls, one per listed tag.
lexc-hfst: $(GTLANG)/bin/$(GTLANG)-lexc.hfst
$(GTLANG)/bin/$(GTLANG)-lexc.hfst: $(SRCS)
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-lexc $(HLEXC_FLAGS) -f foma -o $@.foma $^
	hfst-fst2fst $(HLEXC_FLAGS) -i $@.foma -f $(HFST_FORMAT) | \
		hfst-reweight -a 0.1 -S +Cmp | \
		hfst-reweight -a 0.1 -S +Der | \
		hfst-reweight -a 0.1 -S +Der1 | \
		hfst-reweight -a 0.1 -S +Der2 | \
		hfst-reweight -a 0.1 -S +Der3 | \
		hfst-reweight -a 0.1 -S +Der4 | \
		hfst-reweight -a 0.2 -S +Err/Sub \
		-o $@
	rm -f $@.foma

# Compose&intersect lexicon and twol into a raw transducer
# $+ expands to both prerequisites in the order listed: lexicon, then rules.
save-hfst: $(GTLANG)/bin/$(GTLANG)-gen.hfst
$(GTLANG)/bin/$(GTLANG)-gen.hfst: \
		$(GTLANG)/bin/$(GTLANG)-lexc.hfst \
		$(GTLANG)/bin/$(GTLANG)-twol.hfst
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-compose-intersect $(HFST_FLAGS) $+ | \
		hfst-determinize $(HFST_FLAGS) | \
		hfst-remove-epsilons $(HFST_FLAGS) | \
		hfst-minimize $(HFST_FLAGS) -o $@

# Remove some tags that are only used for internal processing, or noise in the
# regular transducer.
# The scratch files (.att, .script, .tmp) are all named after $@, so this rule
# cannot collide with another rule that reads the same -gen.hfst input when
# make runs jobs in parallel.
# NOTE(review): the `+∏∏` symbol passed to hfst-reweight below looks like an
# encoding casualty - confirm which tag was intended.
$(GTLANG)/bin/$(GTLANG)-tagsremoved.hfst: \
		$(GTLANG)/bin/$(GTLANG)-gen.hfst \
		common/src/usage-tags-remove.regex \
		common/src/remove-morph-borders.regex \
		common/src/remove-variant-homonym-tags.regex \
		common/src/replace-der-with-subreading.regex
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-fst2txt -i $< -o $@.att
	printf "read att $@.att \n\
	define Lexicon ; \n\
	define TagFilter @re\"common/src/usage-tags-remove.regex\" ; \n\
	define VariantFilter @re\"common/src/remove-variant-homonym-tags.regex\" ; \n\
	define Subreadings @re\"common/src/replace-der-with-subreading.regex\" ; \n\
	define MorphBorderDel @re\"common/src/remove-morph-borders.regex\" ; \n\
	read regex TagFilter .o. VariantFilter .o. Subreadings .o. Lexicon .o. MorphBorderDel ; \n\
	save stack $@.tmp \n" > $@.script
	hfst-xfst -F $(HFST_FORMAT) -f $@.script
	hfst-fst2fst $(HLEXC_FLAGS) -i $@.tmp -f $(HFST_FORMAT) | \
		hfst-reweight -a 0.1 -S +Cmp | \
		hfst-reweight -a 0.1 -S +∏∏ | \
		hfst-reweight -a 0.2 -S +Err/Sub \
		-o $@
	rm -f $@.att
	rm -f $@.script
	rm -f $@.tmp

# Make some tags optional, and optimize the inverted (generating) transducer:
ihfst i$(GTLANG).hfst: $(GTLANG)/bin/i$(GTLANG).hfstol
$(GTLANG)/bin/i$(GTLANG).hfstol: \
		$(GTLANG)/bin/$(GTLANG)-tagsremoved.hfst \
		common/src/tag-not-save.regex
	@echo
	@echo "*** Building lookup-optimized $(@F) ***" ;
	@echo
	hfst-fst2txt -i $< -o $@.att
	printf "read att $@.att \n\
	define Lexicon ; \n\
	define GenTagFilter @re\"common/src/tag-not-save.regex\" ; \n\
	read regex GenTagFilter .o. Lexicon ; \n\
	save stack $@.tmp \n" > $@.script
	hfst-xfst -F $(HFST_FORMAT) -f $@.script
	hfst-determinize $(HFST_FLAGS) -i $@.tmp | \
		hfst-minimize $(HFST_FLAGS) | \
		hfst-fst2fst $(HFST_FLAGS) -O -o $@
	rm -f $@.att
	rm -f $@.script
	rm -f $@.tmp

# Add uppercasing
upper-lexc-hfst: $(GTLANG)/bin/$(GTLANG)-upper.hfst
$(GTLANG)/bin/$(GTLANG)-upper.hfst: \
		$(GTLANG)/bin/$(GTLANG)-tagsremoved.hfst \
		common/bin/uppercase-first.hfst
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	cat $< | \
		hfst-compose-intersect $(HFST_FLAGS) \
		-2 common/bin/uppercase-first.hfst > $@

# Finally invert and optimize
# Besides $@ this recipe also writes the lookup-optimized companion "$@ol",
# which has no rule of its own.
$(GTLANG).hfst: $(GTLANG)/bin/$(GTLANG).hfst ihfst
$(GTLANG)/bin/$(GTLANG).hfst: $(GTLANG)/bin/$(GTLANG)-upper.hfst
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-invert -i $< -o $@
	@echo
	@echo "*** Building lookup-optimized $(@F)ol ***" ;
	@echo
	hfst-determinize $(HFST_FLAGS) -i $@ | \
		hfst-minimize $(HFST_FLAGS) | \
		hfst-fst2fst $(HFST_FLAGS) -O -o $@ol

# Filter out all unwanted strings:
# common/src/use-NA-filter.regex is a prerequisite only; the generated script
# does not reference it.
# The .att scratch file is named after $@ (see the -tagsremoved rule).
$(GTLANG)/bin/$(GTLANG).filtered.hfst: \
		$(GTLANG)/bin/$(GTLANG)-gen.hfst \
		$(GTLANG)/src/focus-filter.regex \
		common/src/derivation-filter.regex \
		common/src/use-sub-filter.regex \
		common/src/usage-tags-remove.regex \
		common/src/use-NA-filter.regex
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-fst2txt -i $< -o $@.att
	printf "read att $@.att \n\
	define Lexicon ; \n\
	define SubFilter @re\"common/src/use-sub-filter.regex\" ; \n\
	define TagFilter @re\"common/src/usage-tags-remove.regex\" ; \n\
	define DerFilter @re\"common/src/derivation-filter.regex\" ; \n\
	define FocFilter @re\"$(GTLANG)/src/focus-filter.regex\" ; \n\
	read regex TagFilter .o. DerFilter .o. FocFilter .o. SubFilter .o. Lexicon ; \n\
	save stack $@ \n" > $@.script
	hfst-xfst -F $(HFST_FORMAT) -f $@.script
	rm -f $@.att
	rm -f $@.script

# Finish normative transducer:
# As with the descriptive transducer, "$@ol" is written as a side product.
hfstnorm: $(GTLANG)/bin/$(GTLANG)-norm.hfst
$(GTLANG)/bin/$(GTLANG)-norm.hfst: $(GTLANG)/bin/$(GTLANG).filtered.hfst
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-invert $< -o $@
	@echo
	@echo "*** Building lookup-optimized $(@F)ol ***" ;
	@echo
	hfst-determinize $(HFST_FLAGS) -i $@ | \
		hfst-fst2fst $(HFST_FLAGS) -f optimized-lookup-weighted -o $@ol

###########################################
# Targets to test morphological transducers
###########################################

# Only the first prerequisite (the yaml test file) is passed to the tester;
# the two transducers are listed so that they are built beforehand.
hfsttest: $(GTLANG)/testing/$(GTLANG)-tests.yaml \
		$(GTLANG)/bin/$(GTLANG).hfst \
		$(GTLANG)/bin/i$(GTLANG).hfstol
	HfstTester.py -Cicv $<

###########################################
# Targets to create spell checkers
###########################################

# Filter out unwanted paths from the speller transducer:
$(GTLANG)/bin/$(GTLANG).speller-filtered.hfst: \
		$(GTLANG)/bin/$(GTLANG)-norm.hfst \
		common/src/Punctuation-filter.regex
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-fst2txt -i $< -o $@.att
	printf "read att $@.att \n\
	define Lexicon ; \n\
	define PunctFilter @re\"common/src/Punctuation-filter.regex\" ; \n\
	read regex Lexicon .o. PunctFilter ; \n\
	save stack $@ \n" > $@.script
	hfst-xfst -F $(HFST_FORMAT) -f $@.script
	rm -f $@.att
	rm -f $@.script

# Project input side to create a one-level automaton for spelling usage
$(GTLANG)/bin/$(GTLANG).single.hfstol: $(GTLANG)/bin/$(GTLANG).speller-filtered.hfst
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-project $(HFST_FLAGS) -p upper $< | \
		hfst-fst2fst $(HFST_FLAGS) -f optimized-lookup-weighted -o $@

# Process the error model into a text file that can be compiled into a transducer:
# $< is the error-model text file; the acceptor is passed to the script via -a.
$(GTLANG)/bin/$(GTLANG).spl-errormodel.hfstol: \
		$(GTLANG)/hfst/default-error-model.txt \
		$(GTLANG)/bin/$(GTLANG).single.hfstol \
		script/editdist.py
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	script/editdist.py -v -s -d 2 -e '@0@' -i $< \
		-a $(GTLANG)/bin/$(GTLANG).single.hfstol | \
		hfst-txt2fst $(HFST_FLAGS) -e '@0@' | \
		hfst-fst2fst $(HFST_FLAGS) -f optimized-lookup-weighted -o $@

hfstspellers: zhfst voikkohfst voikkoinstall

# Create a ZHFST speller archive out of the available transducers:
# The two transducers are copied under the fixed names used inside the
# archive, and any old archive is removed before zip is run.
zhfst: $(GTLANG)/hfst/$(GTLANG)-speller.zhfst
$(GTLANG)/hfst/$(GTLANG)-speller.zhfst: \
		$(GTLANG)/bin/$(GTLANG).single.hfstol \
		$(GTLANG)/bin/$(GTLANG).spl-errormodel.hfstol \
		$(GTLANG)/hfst/index.xml
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	cp -f $(GTLANG)/bin/$(GTLANG).single.hfstol $(GTLANG)/hfst/acceptor.default.hfstol
	cp -f $(GTLANG)/bin/$(GTLANG).spl-errormodel.hfstol $(GTLANG)/hfst/errmodel.default.hfstol
	rm -f $(GTLANG)/hfst/$(GTLANG)-speller.zhfst
	cd $(GTLANG)/hfst/ && $(ZIP) -v -9 $(@F) \
		index.xml acceptor.default.hfstol errmodel.default.hfstol

# Create the Voikko directory layout with a relative symlink to the archive.
voikkohfst:
	@echo
	@echo "*** Building GT Voikko+HFST speller for $(GTLANG) ***" ;
	@echo
	mkdir -p $(GTLANG)/voikko/2/mor-$(GTLANG)
	cd $(GTLANG)/voikko/2/mor-$(GTLANG) && \
		ln -sf ../../../hfst/$(GTLANG)-speller.zhfst speller.zhfst

# Install the speller into the user's ~/.voikko directory.
# The archive and the voikko directory are declared as prerequisites so the
# copy does not depend on the order in which `hfstspellers` lists its targets
# (which is not guaranteed under `make -j`).
voikkoinstall: $(GTLANG)/hfst/$(GTLANG)-speller.zhfst voikkohfst
	@echo
	@echo "*** Installing GT Voikko+HFST speller for $(GTLANG) ***" ;
	@echo
	mkdir -p ~/.voikko/2/mor-$(GTSHORTLANG)
	cp -f $(GTLANG)/hfst/$(GTLANG)-speller.zhfst \
		~/.voikko/2/mor-$(GTSHORTLANG)/speller.zhfst
	sed 's/: $(GTLANG)/: $(GTSHORTLANG)/' $(GTLANG)/voikko/2/mor-$(GTLANG)/voikko-fi_FI.pro \
		> ~/.voikko/2/mor-$(GTSHORTLANG)/voikko-fi_FI.pro

# Compile the OCR correction regex into a weighted transducer
ocr: $(GTLANG)/bin/ocr-errmodel.hfst
$(GTLANG)/bin/ocr-errmodel.hfst: \
		$(GTLANG)/hfst/ocr-errmodel.regex \
		$(GTLANG)/bin/$(GTLANG).symbols.txt
	@echo
	@echo "*** Building $(@F) ***" ;
	@echo
	hfst-regexp2fst $(HFST_FLAGS) \
		-i $< \
		-R $(GTLANG)/bin/$(GTLANG).symbols.txt | \
		hfst-repeat $(HFST_FLAGS) -t 3 | \
		hfst-fst2fst $(HFST_FLAGS) -O -o $@