import java.io.*; import java.util.Vector; import java.util.Iterator; import java.util.StringTokenizer; import no.divvun.Analyzer.Objects.*; import no.divvun.Analyzer.Client.*; import no.divvun.Analyzer.Communication.Response; import no.divvun.Analyzer.Communication.Parameters; public class Lexc2xspell implements DivvunRequestListener { DivvunBasicServices srv; ReadLexc readLexc; /** * @param args */ public static void main(String[] args) throws IOException { Lexc2xspell xspell = new Lexc2xspell(args); } public Lexc2xspell(String[] args) throws IOException { if (args.length < 2) { System.out.println("Too few parameters"); return; } String filein = args[0]; int POS = 0; int LANG = 0; if (filein.contains("sme-lex")) LANG = LexcOptions.SME; else if (filein.contains("smj-lex")) LANG = LexcOptions.SMJ; else if (filein.contains("sma-lex")) LANG = LexcOptions.SMA; else { System.out.println("Wrong type of file: Only sm[aej]-lex files accepted"); return; } if (filein.contains("conjunction-sm")) POS = LexcOptions.CONJUNCTION; else if (filein.contains("subjunction-sm")) POS = LexcOptions.SUBJUNCTION; else if (filein.contains("propernoun-sm")) POS = LexcOptions.PROPERNOUN; else // pronoun has to be above noun-sme, otherwise it is assigned as NOUN if (filein.contains("pronoun-sm")) POS = LexcOptions.PRONOUN; else if (filein.contains("pp-sm")) POS = LexcOptions.ADPOSITION; else if (filein.contains("verb-sm")) POS = LexcOptions.VERB; else if (filein.contains("adj-sm")) POS = LexcOptions.ADJECTIVE; else if (filein.contains("adv-sm")) POS = LexcOptions.ADVERB; else if (filein.contains("acro-sm")) POS = LexcOptions.ACRO; else if (filein.contains("abbr-sm")) POS = LexcOptions.ABBR; else if (filein.contains("numeral-sm")) POS = LexcOptions.NUMERAL; else if (filein.contains("num-sm")) POS = LexcOptions.NUM; else if (filein.contains("particle-sm")) POS = LexcOptions.PARTICLE; else if (filein.contains("interjection-sm")) POS = LexcOptions.INTERJECTION; readLexc = new ReadLexc (filein, LANG, POS); srv = null; try { // srv = DivvunRequestManager.newDivvunBasicServices(this, "129.242.220.113", 8082); // srv = DivvunRequestManager.newDivvunBasicServices(this, "hum-tf4-ans142.hum.uit.no", 8086); // srv = DivvunRequestManager.newDivvunBasicServices(this, "victorio.uit.no", 8080); srv = DivvunRequestManager.newDivvunBasicServices(this, "divvun.no", 8089); } catch(Exception e) { System.out.println(e.getMessage()); } if (srv != null) { Parameters param = new Parameters(); param.setLang(LexcOptions.getOption(LANG)); param.setFst(args[1]); param.setGrammar(args[2]); param.setTags(args[3]); param.setParadigm(true); param.setHyphenate(true); param.setGenerate(true); param.setUseFilter(true); param.setXml_in(true); param.setXml_out(true); srv.initServer(param); Iterator iterator = readLexc.getEntries().iterator(); Entry entry = null; Derivation der = null; Paradigm paradigm = null; Paradigm derpara = null; String word = null; String ordinal = null; Vector vProcessedWords = new Vector(); while(iterator.hasNext()) { entry = iterator.next(); if (entry.getWord().contains("+A+Ord")) ordinal = "A"; if (entry.getWord().contains("+N+Coll")) ordinal = "N+Coll"; if (entry.getPOS() != LexcOptions.NUM) { word = cleanWord(entry.getWord()); entry.setWord(word); } // System.err.println(entry.getWord()); if (!vProcessedWords.contains(entry.getWord())) { if (POS != LexcOptions.MIDDLE_NOUN) { if (ordinal == null) { paradigm = srv.getFullParadigm(entry.getWord(), entry.getsPOS()); if (entry.getsPOS().equals("Po") && paradigm == null) paradigm = srv.getFullParadigm(entry.getWord(), "Pr"); if (POS == LexcOptions.NUM) { entry.addParadigm(paradigm); paradigm = srv.getFullParadigm(entry.getWord(), "A+Ord"); entry.addParadigm(paradigm); paradigm = srv.getFullParadigm(entry.getWord(), "N+Coll"); } } else paradigm = srv.getFullParadigm(entry.getWord(), ordinal); vProcessedWords.add(entry.getWord()); entry.addParadigm(paradigm); String buffer = PrintEntries.printPlx(entry); System.out.print(buffer); } /* Iterator deriter = readLexc.getDerivations().iterator(); while(deriter.hasNext()) { der = deriter.next(); if ((!der.getType().endsWith("V") && !der.getType().endsWith("A")) && der.getType().startsWith(entry.getsPOS())) { derpara = srv.getFullParadigm(word + "+" + entry.getsPOS() + der.getDer(), der.getType().substring(1,2)); if (derpara != null) { if (entry.getParadigm() == null) entry.setParadigm(new Paradigm()); for (int i = 0; i < derpara.size(); i++) entry.getParadigm().add(derpara.get(i)); } // entry.setParadigm(derpara); } } if (entry.getsPOS().equalsIgnoreCase("N") || entry.getsPOS().equalsIgnoreCase("A")) entry = getCompoundStem(entry); */ // if (entry.getPOS() == LexcOptions.MIDDLE_NOUN) { // System.out.println(word + "\t" + entry.getInfl()); else { if (entry.getInfl().equalsIgnoreCase("ProperNoun")) System.out.println(word + "\tNeP#"); else if (entry.getInfl().equalsIgnoreCase("AdjectiveRoot")) System.out.println(word + "\tJ#,Ja#,Jp#,Jn#"); else if (entry.getInfl().equalsIgnoreCase("NounRoot")) { System.out.println(word + "\tN#,Na#,Np#,Nn#,Ga#,Gp#,Gn#"); System.out.println(word + "-\tN#,Na#,Np#,Nn#,Ga#,Gp#,Gn#"); } else if (entry.getInfl().equalsIgnoreCase("Rreal")) { System.out.println(word + "\tNO"); System.out.println(word + "-\tNOA"); } else if (entry.getInfl().equalsIgnoreCase("Rmiddle")) { System.out.println(word + "\tNO"); System.out.println(word + "-\tNOA"); } else if (entry.getInfl().equalsIgnoreCase("Rnoun")) { System.out.println(word + "\tN#,Na#,Np#,Nn#,Ga#,Gp#,Gn#"); System.out.println(word + "-\tN#,Na#,Np#,Nn#,Ga#,Gp#,Gn#,NI"); } else if (entry.getInfl().equalsIgnoreCase("RAbbr")) { System.out.println(word + "\tN#,Na#,Np#,Nn#,Ga#,Gp#,Gn#,NBO"); } else if (entry.getInfl().equalsIgnoreCase("R")) { System.out.println(word + "\tN#,Na#,Np#,Nn#,Ga#,Gp#,Gn#,NBO"); System.out.println(word + "-\tN#,Na#,Np#,Nn#,Ga#,Gp#,Gn#,NIBOA"); } else if (entry.getInfl().equalsIgnoreCase("VerbRoot")) System.out.println(word + "\tV#"); } // else { // entry = getHyphenation(entry); // String buffer = PrintEntries.printPlx(entry); // System.out.print(buffer); // } entry.setParadigm(null); ordinal = null; } } srv.disconnect(); } } public Entry getCompoundStem(Entry entry) { String word = entry.getWord(); Generation gen; if ((gen = srv.generateWordform(word + "+" + entry.getsPOS() + "+SgCmpmuorra+N+Sg+Nom")) != null) { for (int i = 0; i < gen.size(); i++) { if (entry.getParadigm() == null) { entry.setParadigm(new Paradigm()); } String comp = gen.getWord(i); comp = comp.replace("muor^ra", ""); Paradigm paradigm = entry.getParadigm(); Element element = new Element(new Reading(Reading.Type.ANALYSIS, "+SgCmp")); element.getReading().setNext(new Reading(Reading.Type.FORM, comp)); paradigm.add(element); entry.setParadigm(paradigm); System.out.println(); } } if ((gen = srv.generateWordform(word + "+" + entry.getsPOS() + "+SgNomCmpmuorra+N+Sg+Nom")) != null) { for (int i = 0; i < gen.size(); i++) { if (entry.getParadigm() == null) { entry.setParadigm(new Paradigm()); } String comp = gen.getWord(i); comp = comp.replace("muor^ra", ""); Paradigm paradigm = entry.getParadigm(); Element element = new Element(new Reading(Reading.Type.ANALYSIS, "+SgNomCmp")); element.getReading().setNext(new Reading(Reading.Type.FORM, comp)); paradigm.add(element); entry.setParadigm(paradigm); } } if ((gen = srv.generateWordform(word + "+" + entry.getsPOS() + "+SgGenCmpmuorra+N+Sg+Nom")) != null) { for (int i = 0; i < gen.size(); i++) { if (entry.getParadigm() == null) { entry.setParadigm(new Paradigm()); } String comp = gen.getWord(i); comp = comp.replace("muor^ra", ""); Paradigm paradigm = entry.getParadigm(); Element element = new Element(new Reading(Reading.Type.ANALYSIS, "+SgGenCmp")); element.getReading().setNext(new Reading(Reading.Type.FORM, comp)); paradigm.add(element); entry.setParadigm(paradigm); } } if ((gen = srv.generateWordform(word + "+" + entry.getsPOS() + "+PlGenCmpmuorra+N+Sg+Nom")) != null) { for (int i = 0; i < gen.size(); i++) { if (entry.getParadigm() == null) { entry.setParadigm(new Paradigm()); } String comp = gen.getWord(i); comp = comp.replace("muor^ra", ""); Paradigm paradigm = entry.getParadigm(); Element element = new Element(new Reading(Reading.Type.ANALYSIS, "+PlGenCmp")); element.getReading().setNext(new Reading(Reading.Type.FORM, comp)); paradigm.add(element); entry.setParadigm(paradigm); } } if ((gen = srv.generateWordform("miella+N+Sg+Gen" + word + "+Cmpnd")) != null) { for (int i = 0; i < gen.size(); i++) { if (entry.getParadigm() == null) { entry.setParadigm(new Paradigm()); } String comp = gen.getWord(i); comp = comp.replace("mie^la^", ""); Paradigm paradigm = entry.getParadigm(); Element element = new Element(new Reading(Reading.Type.ANALYSIS, "+ShCmp")); element.getReading().setNext(new Reading(Reading.Type.FORM, comp)); paradigm.add(element); entry.setParadigm(paradigm); } } return entry; } public Entry getHyphenation(Entry entry) { String word = null; Paradigm paradigm = null; word = entry.getWord(); entry.setWord(srv.hyphenateWord(word, "")); paradigm = entry.getParadigm(); if(paradigm != null) { for(int i = 0; i < paradigm.size(); i++) { word = paradigm.get(i).getReading().getNext().getValue(); word = srv.hyphenateWord(word, ""); paradigm.get(i).getReading().getNext().setValue(word); } entry.setParadigm(paradigm); } return entry; } public void addParadigm() { } private String cleanWord(String word) { StringTokenizer st = new StringTokenizer(word); String clean; if (word.contains("+")) clean = st.nextToken("+"); else clean = st.nextToken(":"); clean = clean.replaceAll("#", ""); clean = clean.replace("^", ""); clean = clean.replaceAll("0", ""); clean = clean.replaceAll("%", ""); clean = clean.replaceAll(">", ""); return clean; } /* (non-Javadoc) * @see no.divvun.Analyzer.Client.DivvunRequestListener#requestCompleted(no.divvun.Analyzer.Communication.Response) */ public void requestCompleted(Response response) { // TODO Auto-generated method stub } /* (non-Javadoc) * @see no.divvun.Analyzer.Client.DivvunRequestListener#requestStatusChanged(no.divvun.Analyzer.Client.DivvunRequestStatus) */ public void requestStatusChanged(DivvunRequestStatus divvunrequeststatus) { // TODO Auto-generated method stub } }