import java.io.*; import java.util.HashMap; import java.util.Iterator; import java.util.StringTokenizer; import java.util.Vector; import no.divvun.Analyzer.Objects.Entry; import no.divvun.Analyzer.Objects.LexcOptions; public class ReadLexc { private int iLang; private int iPOS; private int iDer; private boolean bCreateEntry; private boolean numimprecise; private boolean bMiddleNoun; private Vector vDerivations = new Vector(); private Vector vEntries = new Vector(); public ReadLexc (String aFileName, int aLang, int aPOS) throws IOException { String line; BufferedReader filein; iLang = aLang; iPOS = aPOS; iDer = -1; try { InputStream is = new FileInputStream(aFileName); filein = new BufferedReader (new InputStreamReader(is)); while ((line = filein.readLine()) != null) { int linelen = line.length(); readLine(line); } } catch (FileNotFoundException e) { System.out.print(e.getMessage()); } String dirname = aFileName.substring(0, aFileName.lastIndexOf("/")+1); String lexfile = dirname.concat(LexcOptions.getOption(iLang) + "-lex.txt"); System.err.println(lexfile); // addDerivations(lexfile); // PrintEntries printEntries = new PrintEntries(vEntries); // printEntries.printAspell(); } public Vector getEntries() { return vEntries; } public Vector getDerivations() { return vDerivations; } public void addDerivations(String aFileName) throws IOException { String line; Boolean cont = true; BufferedReader filein; HashMap derivs = new HashMap(); try { InputStream is = new FileInputStream(aFileName); filein = new BufferedReader (new InputStreamReader(is)); while ((line = filein.readLine()) != null && cont) { if (line.contains("Der#begin")) iDer = -1; else if (line.contains("Der#1")) iDer = 1; else if (line.contains("Der#2")) iDer = 2; else if (line.contains("Der#3")) iDer = 3; else if (line.contains("Der#4")) iDer = 4; else if (line.contains("Der#other")) iDer = 0; else if (line.contains("Der#end")) cont = false; else if (iDer > -1 && line.contains("+")) { StringTokenizer st = new StringTokenizer(line); String der = st.nextToken(); String type; if (st.hasMoreTokens()) { st.nextToken(); type = st.nextToken(); } else type = ""; String position; if (line.contains("+Der1+Der2")) position = "+Der1+Der2"; else if (line.contains("Actor")) position = "+Der1+Der2"; else position = new String("+Der" + Integer.toString(iDer)); Derivation derivation = new Derivation(der, position, type); vDerivations.add(derivation); } } } catch (FileNotFoundException e) { System.out.print(e.getMessage()); } Vector tmp = new Vector(); Iterator iter = vDerivations.iterator(); while (iter.hasNext()) { Derivation der = iter.next(); Iterator iter2 = vDerivations.iterator(); while(iter2.hasNext()) { Derivation der2 = iter2.next(); //System.err.println(der.getDer() + der2.getDer()); String pos = der.getPosition(); String pos2 = der2.getPosition(); //System.err.println(pos + "\t" + pos2); if (!pos.equalsIgnoreCase("+Der0") && (pos.substring(pos.lastIndexOf("+")).compareTo(pos2.substring(0,5)) < 0 && der.getType().endsWith(der2.getType().substring(0,1)))) { String derstring = new String(der.getDer() + "+" + der.getType().substring(1,2) + der2.getDer()); Derivation derivation = new Derivation(derstring, "+Der0", der.getType().substring(0,1) + der2.getType().substring(1,2)); tmp.add(derivation); /* System.out.print(der.getDer()); System.out.print("+" + der.getType().substring(1,2)); System.out.print(der2.getDer()); System.out.println("+" + der2.getType().substring(1,2)); */ if ((pos.equalsIgnoreCase("+Der1") && pos2.equalsIgnoreCase("+Der2")) || (pos.equalsIgnoreCase("+Der1") && pos2.equalsIgnoreCase("+Der3")) || (pos.equalsIgnoreCase("+Der2") && pos2.equalsIgnoreCase("+Der3"))) { Iterator iter3 = vDerivations.iterator(); while (iter3.hasNext()) { Derivation der3 = iter3.next(); String pos3 = der3.getPosition(); if (((der3.getPosition().equalsIgnoreCase("+Der3") && !pos2.equalsIgnoreCase("+Der3")) || der3.getPosition().equalsIgnoreCase("+Der4")) && derivation.getType().endsWith(der3.getType().substring(0,1))) { derstring = new String(derivation.getDer() + "+" + derivation.getType().substring(1,2) + der3.getDer()); Derivation derivation2 = new Derivation(derstring, "+Der0", derivation.getType().substring(0,1) + der3.getType().substring(1,2)); tmp.add(derivation2); if (pos.equalsIgnoreCase("+Der1") && pos2.equalsIgnoreCase("+Der2") && pos3.equalsIgnoreCase("+Der3")) { Iterator iter4 = vDerivations.iterator(); while (iter4.hasNext()) { Derivation der4 = iter4.next(); if (der4.getPosition().equalsIgnoreCase("+Der4") && derivation.getType().endsWith(der4.getType().substring(0,1))) { derstring = new String(derivation.getDer() + "+" + derivation.getType().substring(1,2) + der4.getDer()); Derivation derivation3 = new Derivation(derstring, "+Der0", derivation.getType().substring(0,1) + der4.getType().substring(1,2)); tmp.add(derivation3); } if (der4.getPosition().equalsIgnoreCase("+Der4") && derivation2.getType().endsWith(der4.getType().substring(0,1))) { derstring = new String(derivation2.getDer() + "+" + derivation2.getType().substring(1,2) + der4.getDer()); Derivation derivation3 = new Derivation(derstring, "+Der0", derivation2.getType().substring(0,1) + der4.getType().substring(1,2)); tmp.add(derivation3); } } } } } } } } } vDerivations.addAll(tmp); /* Debug Iterator test = vDerivations.iterator(); while (test.hasNext()) { Derivation dertest = test.next(); if (dertest.getType().startsWith("V")) // && !dertest.getType().endsWith("V")) System.out.println(dertest.getDer()); } System.exit(0); */ } private void readLine(String aLine) { // System.out.println(aLine); // System.out.println(iPOS); if (aLine.startsWith("LEXICON")) { if(((iPOS == LexcOptions.NOUN) && aLine.endsWith("NounRoot")) || ((iPOS == LexcOptions.VERB)) || ((iPOS == LexcOptions.ADJECTIVE) && aLine.endsWith("AdjectiveRoot")) || ((iPOS == LexcOptions.PROPERNOUN) && aLine.endsWith("ProperNoun")) || ((iPOS == LexcOptions.ADVERB) && aLine.endsWith("Adverb")) || ((iPOS == LexcOptions.PRONOUN)) || ((iPOS == LexcOptions.ADPOSITION)) || ((iPOS == LexcOptions.ACRO)) || ((iPOS == LexcOptions.ABBR)) || ((iPOS == LexcOptions.NUM)) || ((iPOS == LexcOptions.NUMERAL)) || ((iPOS == LexcOptions.PARTICLE)) || ((iPOS == LexcOptions.CONJUNCTION)) || ((iPOS == LexcOptions.SUBJUNCTION)) || ((iPOS == LexcOptions.INTERJECTION)) || ((iPOS == LexcOptions.MIDDLE_NOUN) && (aLine.contains("Rmiddle") || aLine.contains("Prefixes") || aLine.endsWith("NounRoot") || aLine.contains("FirstComponent"))) ){ if (aLine.endsWith("num-imprecise") || aLine.endsWith("11TO99F") || aLine.endsWith("2TO9LOG")) numimprecise = true; else numimprecise = false; if (aLine.contains("Rmiddle")) bMiddleNoun = true; else bMiddleNoun = false; bCreateEntry = true; } else bCreateEntry = false; } else if (bCreateEntry) { aLine = aLine.trim(); // if ((!aLine.equals("") || (!aLine.contains("^C^") || !aLine.contains("SUB") ) ) && (!aLine.startsWith("!") && !aLine.startsWith(" ") )) if (iPOS == LexcOptions.MIDDLE_NOUN && !(aLine.contains("+Err/Sub") || aLine.equals("") || aLine.startsWith("!"))) createEntry(aLine); else if (aLine.contains("+Use/Circ") || aLine.contains("+Err/Sub") || aLine.equals("") || aLine.startsWith("!") || aLine.startsWith(" ") || aLine.startsWith("+") || aLine.contains("+Use/CircN") || aLine.startsWith("\t") || aLine.startsWith(":") || aLine.startsWith("<")) ; else /*if ((iPOS==LexcOptions.NUMERAL ^ !aLine.contains("+Ord")) || numimprecise) */ createEntry (aLine); } } private void createEntry(String aLine) { Entry entry; StringTokenizer st; String word; String infl = ""; String tmp; String cmp = new String(""); // int inflCode; // System.err.println(aLine); st = new StringTokenizer(aLine); word = st.nextToken(); while (word.endsWith("%")) { word = word.replaceAll("%", " "); word = word.concat(st.nextToken()); } if (st.hasMoreTokens()) infl = st.nextToken(); if (bMiddleNoun) infl = new String("Rmiddle"); if (aLine.contains("!+")) { do { tmp = st.nextToken(); if (tmp.contains("+")) cmp = cmp.concat(tmp); } while (st.hasMoreTokens()); cmp = cmp.replaceAll("!", ""); } /* inflCode = LexcOptions.getCode(infl); if (inflCode == -1) System.err.println("Inflection " + infl + " not found in LexcOptions!"); */ if (cmp.equalsIgnoreCase("")) entry = new Entry(word, iLang, iPOS, infl); else entry = new Entry(word, iLang, iPOS, infl, cmp); if (!vEntries.add(entry)) { System.err.println("Failed to add word " + word + " to Vector!"); } } }