## Process this file with automake to produce Makefile.in

## Copyright (C) 2011 Samediggi

## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.

####### Add corpus-based weights: #######

# Derived file names, both keyed on CORPUSNAME (which must be set by the
# including makefile):
#   CORPUSNAME.surfweights.hfst - fst of weighted surface forms
#   CORPUSNAME.unitweight.txt   - the single unit weight value
SURFWEIGHTS=$(CORPUSNAME).surfweights.hfst
UNITWEIGHT=$(CORPUSNAME).unitweight.txt

# Use this as the source lexical fst for unit weighting; it contains correct
# surface forms except for the word boundary #, which is still present and
# is used in the weighting:
UW_SPELLER_SRC=generator-mobilespeller-gt-norm-base.hfst

# TSV file defining weights per tag
TAGWEIGHTS=tags.reweight

# ALPHA for additive smoothing, [0, 1.0] seems good
ALPHA=1.0
#NORMALISED_MAXWEIGHT=1000

# Pipeline suffix that truncates the frequency-sorted type list to the first
# CORPUS_SIZE lines; expands to nothing when CORPUS_SIZE is unset or empty.
# The test uses the POSIX `[ ... ]` form on purpose: $(shell ...) runs under
# $(SHELL), and a plain POSIX sh does not understand the bash-only `[[ ... ]]`,
# which would make the size limit silently disappear.
corpus_size_limit_command=$(shell \
    if [ x$(CORPUS_SIZE) != x ] ; \
    then \
        echo "| head -n $(CORPUS_SIZE)"; \
    fi)

# Convert a text-format transducer in weighting/ to an openfst-tropical fst:
%.hfst: weighting/%.att
	$(AM_V_GEN)$(HFST_TXT2FST) $(HFST_FLAGS) -f openfst-tropical $< -o $@

# copy cleaned corpus into the local dir:
$(CORPUSNAME).clean.txt: \
		weighting/$(CORPUSNAME).clean.txt
	$(AM_V_CP)cp -f $< $@

# sort the clean corpus:
%.sort.txt: %.clean.txt
	$(AM_V_GEN)sort < $< > $@

# token count (number of lines in the sorted corpus):
%.wordcount.txt: %.sort.txt
	$(AM_V_GEN)wc -l < $< > $@

# Unique the sorted, clean corpus; the result is ordered by descending
# frequency and optionally cut off by $(corpus_size_limit_command):
%.uniq.txt: %.sort.txt
	$(AM_V_GEN)uniq -c < $< | sort -nr $(corpus_size_limit_command) > $@

# type count (number of lines in the uniqued corpus):
%.typecount.txt: %.uniq.txt
	$(AM_V_GEN)wc -l < $< > $@

# calculate unit weight, smoothed using ALPHA:
#%.unitweight.txt: %.wordcount.txt %.typecount.txt
#	$(AM_V_GEN)paste $^ |\
#		sed -e "s/^/scale=5; -l($(ALPHA)\/(/" \
#		    -e "s/ / + ($(ALPHA) */" -e "s/$$/)))/" \
#		| $(BC) -l > $@

# Alternative unit weight: highest tropical weight + ALPHA.
# The weight is read from the second tab-separated field of the tropical
# file; $< (rather than $^) keeps the redirection valid even if further
# prerequisites are ever added to this rule.
%.unitweight.txt: %.tropical.txt
	$(AM_V_GEN)echo "$$(cut -f2 < $< | sort -nru | head -n1) + $(ALPHA)" \
		| $(BC) -l > $@

# add tropical weights to the corpus; the awk script receives the token
# count as CS, the type count as DS and the smoothing constant as ALPHA:
%.tropical.txt: %.uniq.txt %.wordcount.txt %.typecount.txt
	$(AM_V_GEN)$(GAWK) -v CS="$$(cat $*.wordcount.txt)" \
			-v DS="$$(cat $*.typecount.txt)" \
			-v ALPHA=$(ALPHA) \
			-f $(GTCORE)/scripts/uniq_count2tropical_weight.awk \
		< $< > $@

# build an fst of surface forms with tropical weights for each word form:
%.surfs.hfst: %.tropical.txt
	$(AM_V_STR2FST)$(HFST_STRINGS2FST) -j $(HFST_FLAGS) \
		-f openfst-tropical -o $@ < $<

# Build an fst with surface form weights that also handles compounds:
# surfs ( word-boundary surfs )*
%.surfweights.hfst: %.surfs.hfst \
		word-boundary.hfst
	$(AM_V_HCONCAT)$(HFST_CONCATENATE) $(HFST_FLAGS) word-boundary.hfst $< \
		| $(HFST_REPEAT) $(HFST_FLAGS) -f 0 -t inf \
		| $(HFST_CONCATENATE) $< - \
		| $(HFST_MINIMIZE) -o $@

# Add the unit weight to each unit in compounds, both dynamic and lexical.
# The weight value is read from $(UNITWEIGHT) when the recipe runs.
# NOTE(review): word-boundary.txt is listed as a prerequisite but is not
# referenced in the recipe - confirm whether the dependency is still needed.
unitweighted.hfst: $(UW_SPELLER_SRC) \
		$(UNITWEIGHT) \
		$(srcdir)/weighting/word-boundary.txt
	$(AM_V_REWEIGHT)$(HFST_REWEIGHT) $(HFST_FLAGS) \
		-e -a $$(cat $(UNITWEIGHT)) $< \
		-o $@

# Keep these intermediate targets when building using --debug:
.SECONDARY: spellercorpus.sort.txt \
            spellercorpus.uniq.txt \
            spellercorpus.surfs.hfst \
            spellercorpus.tropical.txt \
            spellercorpus.typecount.txt \
            spellercorpus.wordcount.txt \
            word-boundary.hfst \
            generator-fstspeller-gt-norm-freq_weighted.hfst \
            generator-fstspeller-gt-norm-unit_weighted.hfst \
            generator-fstspeller-gt-norm-norm_weighted.hfst \
            generator-fstspeller-gt-norm-tag_weighted.hfst \
            $(SURFWEIGHTS) \
            $(UNITWEIGHT)