1. XML conversion validity test before conversion: find orig/nob -name "*" -type f | grep -v '.svn' | grep -v "xsl" > object_files_nob.txt find orig/sme -name "*" -type f | grep -v '.svn' | grep -v "xsl" > object_files_sme.txt find orig/nob -name "*" -type f | grep -v '.svn' | grep ".xsl" > meta_files_nob.txt find orig/sme -name "*" -type f | grep -v '.svn' | grep ".xsl" > meta_files_sme.txt wc -l *_nob.txt wc -l *_sme.txt perl -ne 's/^(.*)(\.xsl)$/$1/; print' meta_files_nob.txt > meta-object_files_nob.txt perl -ne 's/^(.*)(\.xsl)$/$1/; print' meta_files_sme.txt > meta-object_files_sme.txt diff meta-object_files_nob.txt object_files_nob.txt diff meta-object_files_sme.txt object_files_sme.txt DELVIS dokumentasjon for punkt 1, 2, 3: http://giellatekno.uit.no/doc/ling/ParallelCorpusConversion.html Konvertere alle orig-filer til xml: convert2xml.pl --orig gir xml-filer i converted 2. parallelity tests Finne parallelle filer: Delvis automatisk, delvis manuelt. main/gt/script/corpus/parallel_corpus_info.xsl 3. usability tests pick-parallel-docs.pl filename Kopierer parallelle nob-sme filpar fra converted til prestable/converted gitt at dokumentene inneholder minst 30 ord og forholdet mellom samisk og norsk tekst er mellom 73 og 110 prosent. For å få output av alle filer som blir oversett: find $GTFREE/converted/sme -name '*.xml' -exec pick-parallel-docs.pl {} \; 2> filersomikkebleplukketut.txt 4. sentence alignment Sentence alignment: corpus-parallel.py -> gir toktmx Pynte toktmx: toktmx2tmx.py -> gir tmx 5. morphosyntactic analysis 5.1 extraction java -Dfile.encoding=UTF8 -Xmx2048m net.sf.saxon.Transform -it main filter_data.xsl inDir=.../prestable/tmx/nob2sme cat out_nob/* > data.nob cat out_sme/* > data.sme 5.2 analysis 5.2.1 apertium A_NN_NB="$HOME/path/to/apertium-nn-nb" A_SME_NOB="$HOME/path/to/apertium-sme-nob" 5.2.1.1. sme: 5.2.1.1.1. 
analysis cat data.sme | apertium-destxt | hfst-proc -w $A_SME_NOB/sme-nob.automorf.hfst.ol | cg-proc -d $A_SME_NOB/sme-nob.rlx.bin |\ apertium-tagger -g $A_SME_NOB/sme-nob.prob > data.tagged.sme echo "tagged sme data in data.tagged.sme" 5.2.1.1.2. cleanup for GIZA/MOSES cat data.tagged.sme | perl -pe 's/<\@[^>]+>//g;' | process-tags.py sme.process-relabel > data.tagged.clean.sme 5.2.1.2. nob: 5.2.1.2.1. analysis cat data.nob | apertium-destxt | lt-proc -w -e $A_NN_NB/nb-nn.automorf.bin | cg-proc -d $A_NN_NB/nb-nn.rlx.bin |\ apertium-tagger -g $A_NN_NB/nb-nn.prob > data.tagged.nob echo "tagged nob data in data.tagged.nob" 5.2.1.2.2. cleanup for GIZA/MOSES cat data.tagged.nob | process-tags.py nob.process-relabel > data.tagged.clean.nob 5.2.2 gt/obt 5.2.2.1. sme: USER='cipriangerstenberger' GTHOME="/Users/$USER/main" ABBR="$GTHOME/gt/sme/bin/abbr.txt" DIS="$GTHOME/gt/sme/src/sme-dis.rle" USME="lookup -q -flags mbTT $GTHOME/gt/sme/bin/sme.fst" IN_DATA='data.sme' OUT_1='xText_01.sme' OUT_2='total_out.sme' cat $IN_DATA | PERL_UNICODE=D perl -p -e 's/\n/ £ \n/g' | \ preprocess --abbr=$ABBR > $OUT_1 cat $OUT_1 | $USME | lookup2cg | vislcg3 -g $DIS > $OUT_2 cat $OUT_2 | PERL_UNICODE=S perl robust_disambiguation_sme.pl > data.tagged.clean.sme 5.2.2.2. nob: cat data.nob | tr '\n' '£' | sed 's/£/ £ ™/g' | tr '™' '\n' | \ ~/main/st/nob/obt/bin/mtag-osx64 | \ vislcg3 -g ~/main/st/nob/obt/src/nob_morf-prestat.cg3 > data.tagged.shitty.nob cat data.tagged.shitty.nob | PERL_UNICODE=S perl robust_disambiguation_nob.pl > data.tagged.clean.nob 6. word alignment ==> TODO