#############################################
# Makefile for the proofing tools test bench.
#############################################

# Find out whether we have access to the bound corpus.
# BOUNDTEST becomes "yes" only when this host is one of the listed machines;
# hostname is queried once and matched against the whole list.
BOUNDTEST =
ifneq (,$(filter victorio.uit.no divvun.no,$(shell hostname)))
  BOUNDTEST = yes
endif

# The $DOC variable should always be specified. The below string is just a dummy
# placeholder to get rid of error messages. The variable is only used in certain
# speller test targets, you can ignore it in all other cases.
DOC       = tmp/GoldstandardTexts.txt
SHORTDOC  = $(shell basename $(DOC) .xml )
BDOC      = tmp/GoldstandardBoundTexts.txt
SHORTBDOC = $(shell basename $(BDOC) .xml )

REVISION  = $(shell cat $(GTLANG)/polderland/revision.txt)

# TESTTIME is part of every test report file name. It must be expanded exactly
# once (simple assignment): with a recursive assignment `date` is re-run for
# each target and prerequisite that mentions it, and a parse that straddles a
# minute boundary yields goal names that no pattern rule matches.
TESTTIME := $(shell date +%H%M)

# Tools:
LN = ln -sf
MV = mv -f
LXC2SPELLDIR = ../tools/lexc2xspell

# Language-dependent Polderland tools:
ifeq ($(GTLANG), sme)
  SPELL  = $(PLTOOLSDIR)/spellSamiNort
  HYPHEN = $(PLTOOLSDIR)/SamiNortHyphMac
endif
ifeq ($(GTLANG), smj)
  SPELL  = $(PLTOOLSDIR)/spellSamiLule
  HYPHEN = $(PLTOOLSDIR)/SamiLuleHyphMac
endif
ifeq ($(GTLANG), sma)
  SPELL  = $(PLTOOLSDIR)/spellSamiSout
  HYPHEN = $(PLTOOLSDIR)/SamiSoutHyphMac
endif

# Speller testing:
# Test reports go below $(GTBIG) unless LIC=bound, in which case they go
# below $(GTPRIV).
PROOFTESTBASE = $(GTBIG)/techdoc/proof
ifeq ($(LIC),bound)
  PROOFTESTBASE = $(GTPRIV)/techdoc/proof
endif
SPLTESTREPDIR   = $(PROOFTESTBASE)/spelling/testing
HYPTESTREPDIR   = $(PROOFTESTBASE)/hyph/testing
TESTTOOL        = pl
TESTXSLDIR      = ../xtdoc/sd/src/documentation/resources/stylesheets
PROOFDISTDIR    = ../prooftools
PROOFDISTSHARED = $(PROOFDISTDIR)/toollibs/shared
WORDVERSION     = 2004

# Use the speller lexicons found in the prooftools dir.
# To make sure you have the latest speller lexicons, do:
# cd $GTHOME/prooftools
# make mslex-download
# then go back here, and run the tests you want.
SPELLERLEX = $(PROOFDISTDIR)/tmp/download/$(WINSPELL)

# Regression file location depends on the test tool;
# default is the polderland dir (used by both pl and mw tools):
SPTOOLDIR = polderland
ifeq ($(TESTTOOL), hu)
  SPTOOLDIR = hunspell
else ifeq ($(TESTTOOL), vk)
  SPTOOLDIR = voikko
else ifeq ($(TESTTOOL), vkmalaga)
  SPTOOLDIR = voikko
else ifeq ($(TESTTOOL), vkhfst)
  SPTOOLDIR = voikko
else ifeq ($(TESTTOOL), hfst)
  SPTOOLDIR = hfst
endif

##### Default target: ######
#all: spelltest - not possible as long as we are part of the main makefile - it
# will default all gt/ making to testing the proofing tools!!!
############################

# None of the goals below name a file they create; declare them phony so a
# stray file or directory with the same name can never mask them.
.PHONY: download download-pl spelltest svnup \
        typos-test regression-test baseform-test wordtype-test \
        paradigm-test correct-test spellergraphs

download: download-pl

# Fetch the latest speller lexicons. $(MAKE) (rather than a literal `make`)
# keeps -n, -j/jobserver and the make binary consistent in the sub-make.
download-pl:
	cd ../prooftools && $(MAKE) mslex-download

#####################################
# Targets to test proofing tools
#####################################

# A general target, to run all speller tests at once (add more dependent
# targets as they become available):
ifeq (xyes, x$(BOUNDTEST))
spelltest: svnup typos-test wordtype-test baseform-test regression-test correct-test
else
spelltest: svnup typos-test wordtype-test baseform-test correct-test regression-test
endif

# Update this working copy and both goldstandard corpora from Subversion.
svnup:
	svn -q up . $$GTFREE/stable/goldstandard/ $$GTBOUND/stable/goldstandard/

#####################################
# Short-hand targets:

# Test speller on typos.txt:
typos-test: \
    $(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/typos/$(DATE)-$(TESTTIME)-typos.xml

# Check that all words in past and present bug reports are correctly dealt with:
regression-test: \
    $(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/regression/$(DATE)-$(TESTTIME)-regression.xml

# Run the baseform self test - check that all baseforms in our lexicons are
# correctly recognised:
baseform-test: \
    $(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/baseform/$(DATE)-$(TESTTIME)-baseform.xml

# Run the wordtype test - check that a number of word construction types are
# correctly recognised or corrected (in case of spelling errors):
wordtype-test: \
    $(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/wordtype/$(DATE)-$(TESTTIME)-wordtype.xml

# Run the paradigm self test - check that all inflected forms of a selected
# set of words are recognised:
paradigm-test: \
    $(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/paradigm/$(DATE)-$(TESTTIME)-paradigm.xml

# Test speller on correct-marked corpus docs, including bound if on vic or XS:
ifeq (xyes, x$(BOUNDTEST))
correct-test: \
    $(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/goldstandard/$(DATE)-$(TESTTIME)-$(SHORTDOC).xml \
    $(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/goldstandard/$(DATE)-$(TESTTIME)-$(SHORTBDOC).xml
else
correct-test: \
    $(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/goldstandard/$(DATE)-$(TESTTIME)-$(SHORTDOC).xml
endif

#####################################
# Target to create graphs of some of the speller test results data:
spellergraphs:
	@python3.2 $$GTHOME/gt/script/dvchart.py \
		$$GTBIG/techdoc/proof/spelling/testing/ \
		$$GTHOME/xtdoc/techdoc/src/documentation/skins/pelt/scripts/

#####################################
# The final target is a simple but sufficient xml format,
# to which the raw test data is converted:
$(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/typos/$(DATE)-$(TESTTIME)-%.xml \
$(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/baseform/$(DATE)-$(TESTTIME)-%.xml \
$(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/wordtype/$(DATE)-$(TESTTIME)-%.xml \
$(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/paradigm/$(DATE)-$(TESTTIME)-%.xml \
$(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/regression/$(DATE)-$(TESTTIME)-%.xml \
$(SPLTESTREPDIR)/$(GTLANG)/$(TESTTOOL)/goldstandard/$(DATE)-$(TESTTIME)-%.xml : \
    tmp/sp-%-$(TESTTOOL)-$(GTLANG).txt \
    tmp/sp-%-$(TESTTOOL)-$(GTLANG).txt.out \
    $(GTLANG)/$(SPTOOLDIR)/version.txt \
    $(SCRIPTDIR)/speller-testres.pl
# The rule above turns one prepared input file (first prerequisite) and the
# matching raw speller output (the .out file) into an XML report at $@,
# creating the report directory first.
# NOTE(review): this is a pattern rule with several targets sharing one stem;
# GNU make treats such a rule as producing all listed targets from a single
# recipe run, while the recipe as visible here only names $@ - confirm that
# this is what is wanted.
	@echo
	@echo "*** Collecting $* results, transforming to XML. Output: ***" ;
	@echo
	@echo "$@" ;
	@echo
	mkdir -p $(@D)
	$(SCRIPTDIR)/speller-testres.pl \
		--$(TESTTOOL) \
		--input=$< \
		--output=$<.out \
		--document=$( $@ 2> $@.time
# NOTE(review): source text is evidently missing on the line above: the
# variable reference opened after --document= is never closed, the remaining
# speller-testres.pl options are gone, and the header of the rule that the
# following recipe lines belong to is not present. The lines below run
# $(SPELL) on the first-column file ($<.c1) and record the tool version, so
# they presumably belong to the Polderland (pl) speller runner - restore from
# version control before relying on this section.
	$(SPELL) -u8 -0 -d -m $(SPELLERLEX) \
		< $<.c1 > $@
	rm -f userdict
# Add speller tool version here
	$(SPELL) --version 2>&1 | rev | cut -d'/' -f1 | rev \
		> tmp/sp-$(TESTTOOL)-version.txt

# ... speller in MS Word through AppleScript.
# Due to MS Word's definition of a word, we can't include words with certain
# chars as part of the input data:-( The chars are: - . :
#
# NB! This target can only run on Macs with MS Word installed!
# For best behaviour, uncheck Preferences>General>Show Gallery at Startup
# in MS Word.
# Command-only goals defined in this section; none of them names a file.
.PHONY: norm-selftest baseforms paradigms hyphtest hyphregression hyphwordtypes

# MS Word runner: the input words (minus those containing - . :) are converted
# to UTF-16, put on the clipboard, and handed to the AppleScript together with
# the output and tool-version file names; the result is converted back to UTF-8.
.PRECIOUS: tmp/sp-%-mw-$(GTLANG).txt.out
tmp/sp-%-mw-$(GTLANG).txt.out: tmp/sp-%-mw-$(GTLANG).txt \
    $(SCRIPTDIR)/spellcheckWithMSWord.applescript
	@echo
	@echo "*** Running $* $(TESTTOOL) test - $(GTLANG) ***" ;
	@echo
	@grep -v '[-.:]' $<.c1 | tr '\n' ' ' | iconv -f UTF-8 \
		-t UTF-16 > $<.c1.utf16
	iconv -f UTF-16 -t UTF-8 < $<.c1.utf16 | pbcopy
	osascript $(SCRIPTDIR)/spellcheckWithMSWord.applescript \
		$(GTLANG) \
		`pwd`/$<.c1.utf16 \
		`pwd`/$@.utf16 \
		`pwd`/tmp/sp-$(TESTTOOL)-version.txt \
		$(WORDVERSION)
	@iconv -f UTF-16 -t UTF-8 $@.utf16 > $@
	@rm -f $@.utf16

# ...hunspell command line speller:
.PRECIOUS: tmp/sp-%-hu-$(GTLANG).txt.out
tmp/sp-%-hu-$(GTLANG).txt.out: tmp/sp-%-hu-$(GTLANG).txt
	@echo
	@echo "*** Running $* $(TESTTOOL) test - $(GTLANG) ***" ;
	@echo
# Add speller tool version:
	hunspell --version | head -n 1 > tmp/sp-$(TESTTOOL)-version.txt
# Run the actual speller:
	hunspell -a --check-url -d $(GTLANG)/hunspell/$(GTLANG) $<.c1 > $@

# ...voikko command line speller, malaga backend (Finnish only):
.PRECIOUS: tmp/sp-%-vkmalaga-$(GTLANG).txt.out
tmp/sp-%-vkmalaga-$(GTLANG).txt.out: tmp/sp-%-vkmalaga-$(GTLANG).txt
	@echo
	@echo "*** Running $* $(TESTTOOL) test - $(GTLANG) ***" ;
	@echo
	@voikkospell -s ignore_dot=1 < $<.c1 > $@
# Add speller tool version here
	@voikkospell --version > tmp/sp-$(TESTTOOL)-version.txt

# ...voikko command line speller, hfst backend (any hfst language):
.PRECIOUS: tmp/sp-%-vkhfst-$(GTLANG).txt.out
tmp/sp-%-vkhfst-$(GTLANG).txt.out: tmp/sp-%-vkhfst-$(GTLANG).txt
	@echo
	@echo "*** Running $* $(TESTTOOL) test - $(GTLANG) ***" ;
	@echo
	@voikkospell -s -d $(GTLANG) -p $(GTLANG)/voikko/ ignore_dot=1 < $<.c1 > $@
# Add speller tool version here
	@voikkospell --version > tmp/sp-$(TESTTOOL)-version.txt

# ...hfst command-line speller tool (library front end):
.PRECIOUS: tmp/sp-%-hfst-$(GTLANG).txt.out
tmp/sp-%-hfst-$(GTLANG).txt.out: tmp/sp-%-hfst-$(GTLANG).txt
	@echo
	@echo "*** Running $* $(TESTTOOL) test - $(GTLANG) ***" ;
	@echo
	hfst-ospell $(GTLANG)/hfst/$(GTLANG)-speller.zhfst < $<.c1 > $@
# Add speller tool version here
	@hfst-ospell --version 2> tmp/sp-$(TESTTOOL)-version.txt

#####################################
# Prepare input data for spell-checking - these sections are specific to each
# test type. Every preparation rule writes the test file $@ and, next to it,
# $@.c1 holding only the first column, which is what the spellers read.
#####################################
# Typos-test input preprocessing:
tmp/sp-typos-$(TESTTOOL)-$(GTLANG).txt: $(GTLANG)/src/typos.txt
	@echo
	@echo "*** Preparing typos test - $(GTLANG) ***" ;
	@echo
# Add easter egg trigger, and remove unwanted lines:
	@echo "nuvviDspeller Divvun" | cat - $< | \
		grep -v '^[[:alnum:]]* ' | \
		grep -v '^#' | \
		grep -v '^!' | \
		grep -v '^$$' > $@
# Extract the second column, and add it as correct input:
	@grep -v '^#' $< | \
		grep -v '^$$' | \
		cut -f2 | \
		tr ' ' '\n' | \
		grep -v '^.$$' | \
		perl -ple 's/^(.*)$$/$$1\t/;' >> $@
# Extract the first column to use as input for spell checker:
	@cut -f1 $@ > $@.c1

#####################################
# Correct-corpus test input preprocessing:
# The target is declared phony so the corpus is re-read on every run. Which
# recipe is used is decided when the makefile is parsed (ifeq on DOC / BDOC).
# sort runs with LC_ALL=C: the former `LOCALE=C` prefix set a variable that
# sort does not consult, so the order depended on the caller's locale.
.PHONY: tmp/sp-$(SHORTDOC)-$(TESTTOOL)-$(GTLANG).txt
tmp/sp-$(SHORTDOC)-$(TESTTOOL)-$(GTLANG).txt:
	@echo
	@echo "*** Preparing corpus test - $(GTLANG) ***" ;
	@echo
# If $DOC is the reserved name (see top of this file), collect all correct docs:
ifeq "x$(DOC)" "xtmp/GoldstandardTexts.txt"
	@echo "*** on ALL stable correct-texts ***" ;
	@echo
	ccat -l $(GTLANG) -a -S -ort -C -r $$GTFREE/stable/goldstandard/converted/$(GTLANG)/ \
		| LC_ALL=C sort \
		| $(SCRIPTDIR)/spell-preprocess.pl \
		> $@.tmp
# If $BDOC is the reserved name (see top of this file), collect all correct
# docs from the bound corpus:
else ifeq "x$(BDOC)" "xtmp/GoldstandardBoundTexts.txt"
	@echo "*** on ALL stable correct-texts in $$GTBOUND ***" ;
	@echo
# Make sure the dir exists before attempting to do further work:
	@if [ ! -d "$$GTBOUND/stable/goldstandard/converted/$(GTLANG)/" ] ; then \
		echo ; \
		echo "*** There is no stable goldstandard dir for $(GTLANG)! ***" ; \
		echo "*** Bailing out. ***" ; \
		echo ; \
		exit 1 ; \
	fi
	ccat -l $(GTLANG) -a -S -ort -C -r $$GTBOUND/stable/goldstandard/converted/$(GTLANG)/ \
		| LC_ALL=C sort \
		| $(SCRIPTDIR)/spell-preprocess.pl \
		> $@.tmp
# Make sure we got some content, if not - bail out:
	@if [ ! -s "$@.tmp" ] ; then \
		echo ; \
		echo "*** No useful content in $$GTBOUND! ***" ; \
		echo "*** Bailing out. ***" ; \
		echo ; \
		exit 1 ; \
	fi
# else use the specified $DOC only
else
	@echo "*** on $(DOC) only ***" ;
	@echo
	ccat -l $(GTLANG) -a -S -ort -C $(DOC) \
		| LC_ALL=C sort \
		| $(SCRIPTDIR)/spell-preprocess.pl \
		> $@.tmp
endif
	@echo "nuvviDspeller Divvun" | cat - $@.tmp > $@
	@cut -f1 $@ > $@.c1
	@rm -f $@.tmp

#####################################
# Regression-test input preprocessing:
tmp/sp-regression-$(TESTTOOL)-$(GTLANG).txt: \
    $(GTLANG)/$(SPTOOLDIR)/regression.txt
	@echo
	@echo "*** Preparing regression test - $(GTLANG) ***" ;
	@echo
	@echo "nuvviDspeller Divvun" | cat - $< | \
		grep -v '^[[:alnum:]]* ' | \
		grep -v '^#' | \
		grep -v '^$$' > $@
	@cut -f1 $@ > $@.c1

#####################################
# Baseform-test input preprocessing:
# Runs all baseforms through $(SPELL) first and keeps only the ones picked out
# of its output by the two grep filters below as the actual test input.
tmp/sp-baseform-$(TESTTOOL)-$(GTLANG).txt: tmp/$(GTLANG)-baseforms.txt
	@echo
	@echo "*** Spell-checking all baseforms in the $(GTLANG) lexicon. ***" ;
	@echo
	@rm -f $@
	@rm -f $@.tmp
	@rm -f $@.unrec
	@touch userdict
	@$(SPELL) -u8 -0 -m $(GTLANG)/polderland/$(MACSPELL) < $< > $@.tmp
	@rm -f userdict
	@echo
	@echo "*** Extracting all unrecognised baseforms - $(GTLANG) ***" ;
	@echo
	@grep '^Gett' $@.tmp | cut -d ' ' -f4 | perl -pe 's/\.\.\.//' > $@.unrec
	@grep 'Check returns' $@.tmp | cut -d '@' -f2 | cut -d "'" -f2 >> $@.unrec
	@sort -u $@.unrec | perl -ple 's/^(.*)$$/$$1\t/;' > $@.tmp
	@echo "nuvviDspeller Divvun" | cat - $@.tmp > $@
	@cut -f1 $@ > $@.c1

# Check that all baseforms are recognised by the normative transducer
norm-selftest: tmp/$(GTLANG)-norm-unrec.txt
tmp/$(GTLANG)-norm-unrec.txt: tmp/$(GTLANG)-baseforms.txt \
    $(GTLANG)/bin/$(GTLANG)-norm.fst
	@echo
	@echo "*** Analyzing all baseforms in the $(GTLANG) lexicon. ***" ;
	@echo
	@lookup -q -flags mbTT -utf8 $(GTLANG)/bin/$(GTLANG)-norm.fst \
		< $< | grep '\?' | cut -f1 > $@
	@echo
	@echo "*** Ready. Unrecognised baseforms in $@ ***" ;
	@echo
	wc -l $@

# Extract all baseforms from the LexC files:
baseforms: tmp/$(GTLANG)-baseforms.txt
tmp/$(GTLANG)-baseforms.txt: $(LXC2SPELLDIR)/src/Baseforms.class \
    $(SRCS)
	@echo
	@echo "*** Building list of baseforms. ***" ;
	@echo
	@rm -f $@
	@rm -f $@.tmp
	@for file in $(SRCS) ; do \
		java -cp $(LXC2SPELLDIR)/build Baseforms \
		$$file >> $@.tmp ; \
	done
	@sort -u $@.tmp | tr -d '\t' > $@

# Build the baseform extraction tool:
$(LXC2SPELLDIR)/src/Baseforms.class:
	@echo
	@echo "*** Building baseform extraction tool. ***" ;
	@echo
	ant -buildfile $(LXC2SPELLDIR)/build.xml

#####################################
# Paradigm testing input processing:
# Paradigm-test input preprocessing:
paradigms: tmp/sp-paradigm-$(TESTTOOL)-$(GTLANG).txt
tmp/sp-paradigm-$(TESTTOOL)-$(GTLANG).txt: \
    $(GTLANG)/testing/paradigm-$(GTLANG).txt
	@echo
	@echo "*** Preparing paradigm test - $(GTLANG) ***" ;
	@echo
	@perl $(SCRIPTDIR)/paradigms2speller.pl $< > $@.tmp
	@echo "nuvviDspeller Divvun" | cat - $@.tmp > $@
	@rm -f $@.tmp
	@cut -f1 $@ > $@.c1

# Regenerate the paradigm list whenever the generator scripts, the word list,
# any of the code tables or the i$(GTLANG)-norm.fst transducer changes.
$(GTLANG)/testing/paradigm-$(GTLANG).txt: \
    $(GTLANG)/testing/gen-paradigms.sh \
    $(GTLANG)/testing/gen-paradigms.pl \
    $(GTLANG)/testing/parawlist.txt \
    $(GTLANG)/testing/adj-codes.txt \
    $(GTLANG)/testing/noun-codes.txt \
    $(GTLANG)/testing/num-codes.txt \
    $(GTLANG)/testing/prop-codes.txt \
    $(GTLANG)/testing/verb-codes.txt \
    $(GTLANG)/bin/i$(GTLANG)-norm.fst
	@echo
	@echo "*** Generating paradigms - $(GTLANG) ***" ;
	@echo
	@cd $(GTLANG)/testing/ && ./gen-paradigms.sh

#####################################
# Wordform-test input preprocessing:
tmp/sp-wordtype-$(TESTTOOL)-$(GTLANG).txt: \
    $(GTLANG)/testing/speller-testbed-$(GTLANG).txt
	@echo
	@echo "*** Preparing wordtype test - $(GTLANG) ***" ;
	@echo
	@echo "nuvviDspeller Divvun" | cat - $< | \
		grep -v '^[[:alnum:]]* ' | \
		grep -v '^#' | \
		grep -v '^$$' > $@
# Extract the second column, and add it as correct input:
	@grep -v '^#' $< | \
		grep -v '^$$' | \
		cut -f2 | \
		tr ' ' '\n' | \
		grep -v '^.$$' | \
		perl -ple 's/^(.*)$$/$$1\t/;' >> $@
# Extract the first column to use as input for spell checker:
	@cut -f1 $@ > $@.c1

#####################################
# Hyphenation testing starts here:
hyphtest: hyphregression hyphwordtypes

hyphregression: \
    $(HYPTESTREPDIR)/regression-$(TESTTOOL)-forrest-$(GTLANG)-$(DATE)-$(TESTTIME).xml

hyphwordtypes: \
    $(HYPTESTREPDIR)/wordtypes-$(TESTTOOL)-forrest-$(GTLANG)-$(DATE)-$(TESTTIME).xml

#####################################
# The final target is a Forrest doc containing the results and some
# calculated statistics:
$(HYPTESTREPDIR)/%-$(TESTTOOL)-forrest-$(GTLANG)-$(DATE)-$(TESTTIME).xml: \
    $(HYPTESTREPDIR)/%-$(TESTTOOL)-$(GTLANG)-$(DATE)-$(TESTTIME).xml \
    $(TESTXSLDIR)/hyphtest2document.xsl
	@echo
	@echo "*** Converting to ForrestDoc. Output in: ***" ;
	@echo
	@echo "$@" ;
	@echo
	@xsltproc \
		--param testlang "'$(GTLANG)'" \
		--param testtype "'$*'" \
		$(TESTXSLDIR)/hyphtest2document.xsl $< | \
		xmllint --encode UTF-8 --output $@ --format -

#####################################
# Convert the test results to a simple but sufficient xml format:
# The lexicon version is read from $<.out.version and the tool version from
# tmp/hyph-$(TESTTOOL)-version.txt; both are written by the runner rule below.
.PRECIOUS: $(HYPTESTREPDIR)/%-$(TESTTOOL)-$(GTLANG)-$(DATE)-$(TESTTIME).xml
$(HYPTESTREPDIR)/%-$(TESTTOOL)-$(GTLANG)-$(DATE)-$(TESTTIME).xml: \
    tmp/hy-%-$(TESTTOOL)-$(GTLANG).txt \
    tmp/hy-%-$(TESTTOOL)-$(GTLANG).txt.out
	@echo
	@echo "*** Collecting $* results, transforming to XML. Output: ***" ;
	@echo
	@echo "$@" ;
	@echo
	$(SCRIPTDIR)/hyphen-testres.pl \
		--$(TESTTOOL) \
		--input=$< \
		--output=$<.out \
		--document=$< \
		--date=$(DATE) \
		--version="`cat $<.out.version`" \
		--toolversion="`cat tmp/hyph-$(TESTTOOL)-version.txt`" \
		--xml=$@

#####################################
# Run the actual hyphenation test here:
.PRECIOUS: tmp/hy-%-pl-$(GTLANG).txt.out
tmp/hy-%-pl-$(GTLANG).txt.out: tmp/hy-%-pl-$(GTLANG).txt
	@echo
	@echo "*** Hyphenation testing - $(GTLANG) ***" ;
	@echo
# Feed the unhyphenated word list ($<.f1) straight to the hyphenator:
	$(HYPHEN) -w -m $(GTLANG)/polderland/$(HYPHPATT) < $<.f1 > $@
# Add lexicon signature/version here
	@echo "nuvviDspeller Divvun" > $@.verstmp
	@touch userdict
	$(SPELL) -u8 -0 -m $(GTLANG)/polderland/$(HYPHDIC) < $@.verstmp \
		> $@.verstmp2
	grep 'version' $@.verstmp2 | cut -f2 > $@.version
	@rm -f userdict $@.verstmp $@.verstmp2
# Add hyphenator tool version here
	$(HYPHEN) --version 2> tmp/hyph-$(TESTTOOL)-version.txt

#####################################
# Prepare the hyphenation data -
# Wordtypes data (check hyphenation of a number of different word constructs):
# $@ keeps the filtered test lines; $@.f1 is column one with the ^ marks
# removed, which is what the hyphenator is given as input.
tmp/hy-wordtypes-$(TESTTOOL)-$(GTLANG).txt: \
    $(GTLANG)/testing/hyphenation.txt \
    $(GTLANG)/polderland/$(HYPHPATT)
#    prepare-hyphentest
	@echo
	@echo "*** Preparing hyphenation test data - $(GTLANG) ***" ;
	@echo
	grep -v "^#" $< | grep -v "^\s*$$" | grep -v ' .* ' > $@
	cut -f1 $@ | tr -d '^' > $@.f1

#####################################
# Prepare the hyphenation data -
# Regression data:
# prepare-hyphentest is a phony prerequisite, so this file is rebuilt on every
# run.
tmp/hy-regression-$(TESTTOOL)-$(GTLANG).txt: \
    $(GTLANG)/polderland/hyph-regressions.txt \
    $(GTLANG)/polderland/$(HYPHPATT) \
    prepare-hyphentest
	@echo
	@echo "*** Preparing hyphenation test data - $(GTLANG) ***" ;
	@echo
	grep -v "^#" $< | grep -v "^\s*$$" | grep -v ' .* ' > $@
	cut -f1 $@ | tr -d '^' > $@.f1

#####################################
# Make sure the latest lexicon hyphenation file is used:
# Best effort only: when the downloaded lexicon is missing a hint is printed
# and the run continues with whatever lexicon is already in place.
.PHONY: prepare-hyphentest
prepare-hyphentest:
	@echo
	@echo "*** Copying hyphenation lexicon file - $(GTLANG) ***" ;
	@echo
	if [ -f $(SPELLERLEX) ] ; then \
		cp $(SPELLERLEX) $(GTLANG)/polderland/$(HYPHDIC) ; \
	else \
		echo "No speller lexicon found. Please do: cd $$GTHOME/prooftools/ && make mslex-download" ;\
	fi

#####################################
# Make sure the latest pattern hyphenation file is used:
$(GTLANG)/polderland/$(HYPHPATT): $(PROOFDISTSHARED)/$(GTLANG)/hyph/$(HYPHPATT)
	@echo
	@echo "*** Copying hyphenation pattern lexicon - $(GTLANG) ***" ;
	@echo
	cp $< $@