# ******************************************************************** #
# This is a common makefile that connects to all the other makefiles   #
# ******************************************************************** #

# Version: $Id$

# Language-specific variables, esp. SRCS files. "sinclude" does not fail
# if the file is missing.
sinclude $(TARGET)/Makefile

include analyser.mk
include mk-files/disamb.mk
include mk-files/speller.mk
include mk-files/hyph.mk
include mk-files/phonrules.mk

# =============================== #
#      Variable definitions       #
# =============================== #

M4 = m4
M4FLAGS =

DOC =
# Recursive on purpose: only evaluated where it is actually used.
SHORTDOC = $(shell basename $(DOC) | cut -d "." -f1 | tr -d '-' )

# Tools used when compiling the transducers.
# NOTE: the whitespace between a value and a trailing "#-utf8" comment is
# part of the value; the definitions are kept exactly as they were.
UFST = /opt/sami/xerox/c-fsm/ix86-linux2.6-gcc3.4/bin/fst -utf8
CFST = xfst #-utf8
XFST = xfst #-utf8
#TWOLC = twolc #-utf8
FSTTOOL = xfst #-utf8
TWOLCTOOL = twolc #-utf8
LEXC = lexc #-utf8

# COMPILER follows MORPHOPHONTOOL: the fst tool when it is "xfst", the twolc
# tool otherwise. "tust" is a small diagnostic target that prints which
# branch was selected.
ifeq ($(MORPHOPHONTOOL), xfst)
tust:
	@echo "***hoi***"
COMPILER = $(FSTTOOL)
else
tust:
	@echo "***hei***"
COMPILER = $(TWOLCTOOL)
endif

GUNZIP = /usr/bin/gunzip
SCP = scp -p
SSH = ssh
VISLCG3 = vislcg3

# Some other tools
SORT = /sw/bin/sort
REZ = /Developer/Tools/Rez
SETFILE = /Developer/Tools/SetFile
HYPHCORR = script/hyphenConverter.sh

# Version-related info:
# DATE is expanded once so that every use within one run sees the same date
# and "date" is not forked on each reference.
DATE := $(shell date +%Y%m%d)
# VERSION stays recursive: version.txt is only read when VERSION is used.
VERSION = $(shell cat $(TARGET)/polderland/version.txt | tr -d " ")

# Host-specific tool locations, used when the host name is victorio.uit.no.
ifeq (victorio.uit.no, $(shell hostname))
CFST = /opt/sami/xerox/c-fsm/ix86-linux2.6-gcc3.4/bin/fst -utf8
XFST = /opt/sami/xerox/bin/xfst -utf8
LEXC = /opt/sami/xerox/bin/lexc -utf8
TWOLC = /opt/sami/xerox/bin/twolc -utf8
endif

# aSpell tools
ASPELL = aspell
ASPELL_FLAGS = --encoding=utf-8
PREZIP = prezip-bin

lang = se
aspell_version = 0.1-1
cwl_files = $(TARGET)/aspell/$(lang).cwl
data_files = $(TARGET)/aspell/$(lang)_affix.dat \
             $(TARGET)/aspell/$(lang).dat \
             $(TARGET)/aspell/l_$(lang).cset \
             $(TARGET)/aspell/l_$(lang).cmap
doc_files =
extra_files = $(TARGET)/aspell/configure \
              $(TARGET)/aspell/info \
              $(TARGET)/aspell/Makefile.pre
multi_files = $(TARGET)/aspell/$(lang).multi
rws_files = $(TARGET)/aspell/$(lang).rws
aspell_distdir = aspell6-$(lang)-$(aspell_version)

# Regular-expression fragments and "read regex" commands built from the
# compiled networks. The double quotes are backslash-escaped so that the
# values can be placed inside double-quoted shell strings; the commands end
# in a literal "\n" sequence.
FSTBASE = @\"common/bin/nohardhyphen.fst\" .o. \
          @\"common/bin/downcase.fst\"

FST = ( @\"$(TARGET)/bin/$(TARGET).save\" .o. \
        @\"common/bin/caseconv.fst\" .o. $(FSTBASE) .o. \
        @\"common/bin/spellrelax.fst\" ) \
      | @\"common/bin/webadr.fst\"

FSTNORM = @\"$(TARGET)/bin/$(TARGET)-norm.save\" .o. \
          @\"common/bin/downcase.fst\" .o. $(FSTBASE)

FSTRESTR = @\"$(TARGET)/bin/$(TARGET)-restr.save\" .o. \
           @\"common/bin/downcase.fst\" .o. $(FSTBASE)

INVERTNET = read regex [ [@\"common/bin/tag-not-save.fst\" ] .o. \
            [@\"$(TARGET)/bin/$(TARGET).save\"] .o. \
            [@\"common/bin/downcase.fst\" ] \
            ] ; \n

TAGINCL = $(TARGET)/int/$(TARGET)-lex.spel \
          $(TARGET)/int/propernoun-$(TARGET)-lex-tmp.spel \
          $(TARGET)/int/noun-$(TARGET)-lex.spel \
          $(TARGET)/int/verb-$(TARGET)-lex.spel \
          $(TARGET)/int/adj-$(TARGET)-lex.spel \
          $(TARGET)/int/abbr-$(TARGET)-lex.spel

INVERTHYPH = read regex [ [@\"common/bin/tag-not-save.fst\" ] .o. \
             [@\"$(TARGET)/bin/derivation-filter.fst\"] .o. \
             [@\"$(TARGET)/bin/hyph-$(TARGET).save\" ] .o. \
             [@\"common/bin/downcase.fst\" ] .o. \
             [@\"$(TARGET)/bin/hyphrules-$(TARGET).fst\".i ] \
             ] ; \n

INVERTNORM = read regex [ [@\"common/bin/tag-not-save.fst\" ] .o. \
             [@\"$(TARGET)/bin/derivation-filter.fst\"] .o. \
             [@\"$(TARGET)/bin/$(TARGET)-norm.save\" ] .o. \
             [@\"common/bin/downcase.fst\" ] \
             ] ; \n

INVERTRESTR = read regex [ [@\"common/bin/tag-not-save.fst\" ] .o. \
              [@\"$(TARGET)/bin/derivation-filter.fst\"] .o. \
              [@\"$(TARGET)/bin/$(TARGET)-restr.save\" ] .o. \
              [@\"common/bin/downcase.fst\" ] \
              ] ; \n

SPLRNONREC = read regex [ [@\"$(TARGET)/bin/derivation-filter.fst\" ] .o. \
             [@\"$(TARGET)/bin/spellernonrec-$(TARGET).save\" ] .o. \
             [@\"common/bin/downcase.fst\" ] .o. \
             [@\"common/bin/remove-hyphen.fst\" ] .o. \
             [@\"$(TARGET)/bin/hyphrules-$(TARGET).fst\".i ] \
             ] ; \n

NONREC = read regex [ [@\"$(TARGET)/bin/derivation-filter.fst\"] .o. \
         [@\"$(TARGET)/bin/nonrec-$(TARGET).save\"] .o. \
         [@\"common/bin/downcase.fst\" ] \
         ] ; \n

HYPH = read regex ( @\"$(TARGET)/bin/hyphrules-$(TARGET).fst\" .o. \
       @\"$(TARGET)/bin/hyph-i$(TARGET).save\" .o. \
       @\"$(TARGET)/bin/$(TARGET)-norm.fst\" ) ; \n

# These targets are commands, not files. "common" in particular shares its
# name with the common/ directory, so it must be declared phony.
.PHONY: tust common wordlist printlarge

common: fst \
        inverse.fst \
        foreign.fst \
        num.fst \
        webadr.fst \
        missing

# Here we make full-form data files
# The generated word list is sent directly to stdout, and then to gzip.
# This is to avoid breaking a 2Gb file size limit in the Xerox tools.
# THIS COMMAND CAN ONLY BE RUN SUCCESSFULLY ON VICTORIO!
# It requires the commercial fst tool to be able to print all lower-words.
#
# NOTE(review): the recipe writes $@.gz rather than $@, so as written here
# the .txt target itself is never created and the rule re-runs on every
# invocation. Target names are left unchanged for existing callers.
wordlist: $(TARGET)/wordlist-$(TARGET).txt

$(TARGET)/wordlist-$(TARGET).txt: $(TARGET)/bin/nonrec-$(TARGET).fst
	@echo
	@echo "*** Building $(TARGET) full-form wordlist ***"
	@echo
	@printf "load stack < $< \n\
	lower-words \n\
	quit \n" > tmp/wordlist-script
	$(CFST) -f tmp/wordlist-script -q | gzip -f > $@.gz
	@rm -f tmp/wordlist-script

# Print the union of the four speller networks, composed with the
# hyphen-convert network, as one sorted and gzipped word list.
printlarge: $(TARGET)/polderland/large-$(TARGET)-plx.txt.gz

$(TARGET)/polderland/large-$(TARGET)-plx.txt.gz: \
		$(TARGET)/bin/spellerverbs-$(TARGET)-plx.fst \
		$(TARGET)/bin/spellernouns-$(TARGET)-plx.fst \
		$(TARGET)/bin/spelleradjs-$(TARGET)-plx.fst \
		$(TARGET)/bin/spellerproper-$(TARGET)-plx.fst \
		common/bin/hyphen-convert.fst
	@echo
	@echo "*** Printing ONE large (sorted) file, $(TARGET) ***"
	@echo
	@printf "load stack < $(TARGET)/bin/spellerverbs-$(TARGET)-plx.fst \n\
	load stack < $(TARGET)/bin/spellernouns-$(TARGET)-plx.fst \n\
	load stack < $(TARGET)/bin/spelleradjs-$(TARGET)-plx.fst \n\
	load stack < $(TARGET)/bin/spellerproper-$(TARGET)-plx.fst \n\
	union net \n\
	load stack < common/bin/hyphen-convert.fst \n\
	turn stack \n\
	compose net \n\
	lower-side \n\
	sort \n\
	print words \n\
	quit \n" > tmp/largelist-script
	LANG= $(CFST) -f tmp/largelist-script -q | \
		gzip -f > $@
	@rm -f tmp/largelist-script

# Downloads the specified files from our public server to the relevant
# polderland folder.
# "plx-download" and "abbr" are command names, not files.
.PHONY: plx-download abbr

# Fetch the four gzipped PLX word lists (adjectives, nouns, proper nouns,
# verbs) from $(SRCSITE) with $(LYNX) into $(TARGET)/polderland/.
# NOTE(review): LYNX and SRCSITE are not defined in this part of the file;
# presumably they come from one of the included makefiles - confirm.
plx-download:
	@echo
	@echo "*** Downloading $(TARGET) PLX files ***"
	@echo
	$(LYNX) $(SRCSITE)/adj-$(TARGET)-plx.txt.gz \
		> $(TARGET)/polderland/adj-$(TARGET)-plx.txt.gz
	$(LYNX) $(SRCSITE)/noun-$(TARGET)-plx.txt.gz \
		> $(TARGET)/polderland/noun-$(TARGET)-plx.txt.gz
	$(LYNX) $(SRCSITE)/propernoun-$(TARGET)-plx.txt.gz \
		> $(TARGET)/polderland/propernoun-$(TARGET)-plx.txt.gz
	$(LYNX) $(SRCSITE)/verb-$(TARGET)-plx.txt.gz \
		> $(TARGET)/polderland/verb-$(TARGET)-plx.txt.gz

# Here we make the abbreviation file for our current preprocessor,
# the perl-based preprocess (located in the script catalogue)

# Helper variables for turning a space-separated list into a
# comma-separated one.
empty:=
comma:=,
space:=$(empty) $(empty)

# LEXICALSRCS as a comma-separated list. $(strip ...) normalises the
# whitespace first, so that leading, trailing or repeated blanks do not
# produce empty entries in the --lex argument.
ABBRSRCS=$(subst $(space),$(comma),$(strip $(LEXICALSRCS)))

abbr: $(TARGET)/bin/abbr.txt

# The abbr lexicon is read by the recipe (--abbr_lex and --lex), so it is
# listed as a prerequisite; otherwise abbr.txt would not be rebuilt when
# that lexicon changes.
$(TARGET)/bin/abbr.txt: script/abbr-extract script/langTools/Util.pm $(LEXICALSRCS) \
		$(TARGET)/src/$(TARGET)-num.txt \
		$(TARGET)/src/abbr-$(TARGET)-lex.txt \
		$(TARGET)/bin/i$(TARGET).fst \
		cwb/paradigm.txt cwb/korpustags.txt
	@echo
	@echo "*** Extracting abbreviations from abbr-$(TARGET)-lex.txt to abbr.txt ***"
	@echo
	@perl -I script script/abbr-extract \
		--paradigm=cwb/paradigm.txt \
		--tags=cwb/korpustags.txt \
		--fst=$(TARGET)/bin/i$(TARGET).fst \
		--output=$@ \
		--abbr_lex=$(TARGET)/src/abbr-$(TARGET)-lex.txt \
		--lex=$(ABBRSRCS),$(TARGET)/src/$(TARGET)-num.txt,$(TARGET)/src/abbr-$(TARGET)-lex.txt

# Here we build a transducer that gives us only the Sámi wordforms missing from
# our transducers. Non-Sámi words from Norwegian, Finnish, English, etc. are
# filtered out by this script, as are registered typos.
# Command-style targets: short aliases for generated files, and the
# cleaning targets. None of them names a file that the recipes create.
.PHONY: missing cap-$(TARGET) typoslist.txt clean rec-clean speller-clean

missing: $(TARGET)/bin/missing

# Writes a small text file that first names four networks (analyzer,
# foreign, typos, webadr) with their paths and then lists the same four
# names again, one per line.
$(TARGET)/bin/missing:
	@echo
	@echo "*** Generating missing ***"
	@echo
	@printf "analyzer bin/$(TARGET).fst\n\
	foreign ../common/bin/foreign.fst\n\
	typos ../common/bin/typos.fst\n\
	webadr ../common/bin/webadr.fst\n\n\
	analyzer\n\
	foreign\n\
	typos\n\
	webadr\n" > $@

cap-$(TARGET): $(TARGET)/bin/cap-$(TARGET)

# Same kind of file as above, naming the analyzer and the allcaps network.
$(TARGET)/bin/cap-$(TARGET):
	@echo
	@echo "*** Generating cap-$(TARGET) ***"
	@echo
	@printf "analyzer $(TARGET)/bin/$(TARGET).fst\n\
	allcaps common/bin/allcaps.fst\n\n\
	allcaps analyzer \n" > $@

#CHECK THIS!! Move this file
typoslist.txt: common/bin/typoslist.txt

# Keep only the first tab-separated column of the typos source file.
common/bin/typoslist.txt: $(TARGET)/src/typos.txt
	@echo
	@echo "*** Our list of common typographical errors ***"
	@echo
	@cut -f1 $< > $@

# Finally an option to remove all the binary files
clean:
	@rm -f common/bin/*.fst common/bin/*.save common/bin/*.bin common/bin/*.txt
#	@rm -f common/int/*.fst
	@rm -f $(TARGET)/bin/*.fst $(TARGET)/bin/*.save $(TARGET)/bin/*.bin \
		$(TARGET)/bin/*.rle
	@rm -f $(TARGET)/bin/cap-$(TARGET) $(TARGET)/bin/missing
	@rm -f $(TARGET)/bin/abbr.txt

rec-clean:
	@rm -f $(NONRECFILES)

# Removes the speller build products in addition to everything "clean"
# removes.
speller-clean: clean
	@rm -f $(TARGET)/polderland/$(WINSPELL)
	@rm -f $(TARGET)/polderland/$(MACSPELL)
	@rm -f $(TARGET)/polderland/middle-noun-$(TARGET)-plx.txt
	@rm -f $(TARGET)/polderland/generated_nums-plx.txt
	@rm -f $(PLXSRCfst)
	@rm -f $(PLXSRCjava)
# Remove the per-language scratch directory in one idempotent step. A plain
# "rm -r tmp/$(TARGET)/*" followed by "rmdir" fails when the directory is
# empty or missing, which would abort the remaining cleanup lines below.
	@rm -rf tmp/$(TARGET)
	@rm -f tmp/*
	@rm -f $(TARGET)/int/*spel
	@rm -f $(TARGET)/hunspell/*sme.txt