## Process this file with automake to produce Makefile.in

## Copyright (C) 2011 Samediggi

## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.

## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.

## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.

####### Add corpus-based weights: #######
# File names derived from the corpus name (CORPUSNAME is not set in this
# fragment; it is expected to be defined by the including Makefile.am):
#   $(CORPUSNAME).surfweights.hfst and $(CORPUSNAME).unitweight.txt
SURFWEIGHTS=$(CORPUSNAME).surfweights.hfst
UNITWEIGHT=$(CORPUSNAME).unitweight.txt
# Use this as the source lexical fst for unit weighting. It contains correct
# surface forms except for the word boundary #, which is still present and is
# used in the weighting:
UW_SPELLER_SRC=generator-mobilespeller-gt-norm-base.hfst
# TSV file defining weights per tag:
TAGWEIGHTS=tags.reweight
# ALPHA for additive smoothing; values in the range [0, 1.0] seem to work well.
# It is passed to the tropical-weight awk script and is also added to the
# highest tropical weight to form the unit weight:
ALPHA=1.0
#NORMALISED_MAXWEIGHT=1000

# Optional pipeline stage used to truncate the frequency-sorted type list:
# expands to "| head -n $(CORPUS_SIZE)" when CORPUS_SIZE is non-empty, and to
# nothing otherwise.
# The decision is made by make's own $(if ...) instead of a $(shell ...) call
# with a bash-only [[ ]] test, so the result does not depend on which shell
# make runs (a plain POSIX sh has no [[ ]]) and no subshell is forked each time
# the variable is expanded. The assignment stays recursively expanded (=) so
# that CORPUS_SIZE may be set after this point, e.g. on the command line.
corpus_size_limit_command=$(if $(strip $(CORPUS_SIZE)),| head -n $(CORPUS_SIZE))

# Build NAME.hfst from weighting/NAME.att by running $(HFST_TXT2FST) on it,
# requesting the openfst-tropical (weighted) output format:
%.hfst: weighting/%.att
	$(AM_V_GEN)$(HFST_TXT2FST) $(HFST_FLAGS) -f openfst-tropical $< -o $@

# Fetch the cleaned corpus from the weighting/ directory into the local dir,
# overwriting any existing copy:
$(CORPUSNAME).clean.txt: weighting/$(CORPUSNAME).clean.txt
	$(AM_V_CP)cp -f $< $@

# Produce a sorted copy of the clean corpus (default sort order), so that
# identical lines end up adjacent for the later uniq step:
%.sort.txt: %.clean.txt
	$(AM_V_GEN)sort $< > $@

# Token count: the number of lines in the sorted corpus (one token per line is
# assumed). The file is fed to wc on stdin so that only the number, and not the
# file name, is written to the target:
%.wordcount.txt: %.sort.txt
	$(AM_V_GEN)wc -l < $< > $@

# Collapse the sorted, clean corpus into one "count form" line per distinct
# form, ordered by descending count, and optionally truncated by whatever
# $(corpus_size_limit_command) expands to:
%.uniq.txt: %.sort.txt
	$(AM_V_GEN)uniq -c $< \
		| sort -nr $(corpus_size_limit_command) \
		> $@

# Type count: the number of lines in the uniq'ed corpus, i.e. the number of
# distinct forms that were kept. The file is fed to wc on stdin so that only
# the number, and not the file name, is written to the target:
%.typecount.txt: %.uniq.txt
	$(AM_V_GEN)wc -l < $< > $@

# Unit weight smoothed using ALPHA (currently disabled, kept for reference):
#%.unitweight.txt: %.wordcount.txt %.typecount.txt
#	$(AM_V_GEN)paste $^ |\
#		sed -e "s/^/scale=5; -l($(ALPHA)\/(/" \
#		    -e "s/	/ + ($(ALPHA) */" -e "s/$$/)))/" \
#		| $(BC) -l > $@

# Alternative unit weight: highest tropical weight + ALPHA.
# The second tab-separated column of the tropical file is read (presumably the
# weight column - confirm against the awk script that writes it), the largest
# value is picked by a reverse numeric sort, and $(BC) adds $(ALPHA) to it.
# The input is taken from $< rather than $^: a shell redirection needs exactly
# one file, and $^ would expand to several words if any further prerequisite
# were ever attached to a matching target.
%.unitweight.txt: %.tropical.txt
	$(AM_V_GEN)echo "$$(cut -f2 < $< | sort -nru | head -n1) + $(ALPHA)" \
		| $(BC) -l > $@

# Turn the "count form" list into a list with tropical weights by running the
# uniq_count2tropical_weight.awk script over it. The script receives:
#   CS    - contents of the word (token) count file
#   DS    - contents of the type count file
#   ALPHA - the smoothing constant
# The uniq file is supplied on stdin.
%.tropical.txt: %.uniq.txt %.wordcount.txt %.typecount.txt
	$(AM_V_GEN)$(GAWK) \
		-v CS="$$(cat $*.wordcount.txt)" \
		-v DS="$$(cat $*.typecount.txt)" \
		-v ALPHA=$(ALPHA) \
		-f $(GTCORE)/scripts/uniq_count2tropical_weight.awk \
		< $< > $@

# Compile the weighted word-form list into a single fst of surface forms in
# openfst-tropical format; the list is supplied to $(HFST_STRINGS2FST) on
# stdin:
%.surfs.hfst: %.tropical.txt
	$(AM_V_STR2FST)$(HFST_STRINGS2FST) -j $(HFST_FLAGS) -f openfst-tropical \
		-o $@ < $<

# Build an fst with surface form weights that also handles compounds.
# Pipeline, stage by stage:
#   1. concatenate word-boundary.hfst with the surface-form fst
#      (boundary followed by a form);
#   2. repeat that unit, with -f 0 -t inf (presumably zero to unboundedly many
#      repetitions);
#   3. concatenate the surface-form fst with the result of stage 2, passed as
#      "-" (presumably meaning stdin), so that one form comes first;
#   4. minimize and write the target.
# NOTE(review): stages 3 and 4 are run without $(HFST_FLAGS), unlike stages 1
# and 2 - confirm whether that is intentional.
%.surfweights.hfst: %.surfs.hfst \
                    word-boundary.hfst
	$(AM_V_HCONCAT)$(HFST_CONCATENATE) $(HFST_FLAGS) word-boundary.hfst $< \
		| $(HFST_REPEAT) $(HFST_FLAGS) -f 0 -t inf \
		| $(HFST_CONCATENATE) $< - \
		| $(HFST_MINIMIZE) -o $@

# Add the unit weight to each unit in compounds, both dynamic and lexical.
# $(HFST_REWEIGHT) is run on the first prerequisite, $(UW_SPELLER_SRC), with
# the contents of the $(UNITWEIGHT) file passed as the argument of -a.
# NOTE(review): $(srcdir)/weighting/word-boundary.txt is listed as a
# prerequisite but is not referenced by the recipe - confirm whether it is
# only meant to trigger a rebuild or whether a processing step is missing.
unitweighted.hfst: $(UW_SPELLER_SRC) \
                   $(UNITWEIGHT)         \
                   $(srcdir)/weighting/word-boundary.txt
	$(AM_V_REWEIGHT)$(HFST_REWEIGHT) $(HFST_FLAGS) \
		-e -a $$(cat $(UNITWEIGHT)) $< \
		-o $@

# Keep these intermediate targets when building using --debug. Listing them
# under .SECONDARY stops make from deleting them automatically as intermediate
# files of pattern-rule chains.
# The corpus-derived entries are spelled with $(CORPUSNAME), in the same way as
# $(SURFWEIGHTS) and $(UNITWEIGHT), so that the list stays correct for whatever
# corpus name the rules above are actually run with.
.SECONDARY: $(CORPUSNAME).sort.txt \
            $(CORPUSNAME).uniq.txt \
            $(CORPUSNAME).surfs.hfst \
            $(CORPUSNAME).tropical.txt \
            $(CORPUSNAME).typecount.txt \
            $(CORPUSNAME).wordcount.txt \
            word-boundary.hfst \
            generator-fstspeller-gt-norm-freq_weighted.hfst \
            generator-fstspeller-gt-norm-unit_weighted.hfst \
            generator-fstspeller-gt-norm-norm_weighted.hfst \
            generator-fstspeller-gt-norm-tag_weighted.hfst \
            $(SURFWEIGHTS) \
            $(UNITWEIGHT)