import java.io.IOException;
import java.util.Iterator;
import java.util.StringTokenizer;

import no.divvun.Analyzer.Objects.Entry;
import no.divvun.Analyzer.Objects.LexcOptions;

/**
 * Command-line tool that reads the entries of a lexc file through
 * {@code ReadLexc} and prints the cleaned word of every entry to standard
 * output, one per line, each followed by a tab character.
 *
 * <p>The language and the part of speech handed to {@code ReadLexc} are both
 * derived from substrings of the input file name.
 */
public class Baseforms {

    /**
     * Program entry point.
     *
     * @param args command-line arguments; {@code args[0]} is the path of the
     *             file to read
     * @throws IOException propagated from the {@link #Baseforms(String[])}
     *                     constructor
     */
    public static void main(String[] args) throws IOException {
        // All the work happens in the constructor; the instance itself is not needed.
        new Baseforms(args);
    }

    /**
     * Reads the file named by {@code args[0]} and prints the cleaned word of
     * every entry to standard output.
     *
     * <p>If no file name is given, a usage line is written to standard error
     * and nothing else happens. If the file name contains none of
     * {@code sme}, {@code smj} or {@code sma}, an error line is written to
     * standard output and nothing else happens.
     *
     * @param args command-line arguments; {@code args[0]} is the path of the
     *             file to read
     * @throws IOException declared for failures while reading the file
     */
    public Baseforms(String[] args) throws IOException {
        if (args == null || args.length == 0) {
            // Previously this case died with ArrayIndexOutOfBoundsException.
            System.err.println("Usage: Baseforms <lexc file>");
            return;
        }
        String filein = args[0];

        int lang;
        if (filein.contains("sme")) {
            lang = LexcOptions.SME;
        } else if (filein.contains("smj")) {
            lang = LexcOptions.SMJ;
        } else if (filein.contains("sma")) {
            lang = LexcOptions.SMA;
        } else {
            System.out.println("Wrong type of file: Only sm[aej] files accepted");
            return;
        }

        ReadLexc readLexc = new ReadLexc(filein, lang, detectPos(filein));

        // The iterator must be typed: with a raw Iterator, next() returns
        // Object and cannot be assigned to an Entry.
        Iterator<Entry> iterator = readLexc.getEntries().iterator();
        while (iterator.hasNext()) {
            printEntry(iterator.next());
        }
    }

    /**
     * Derives the part-of-speech code from the file name.
     *
     * <p>The checks are order-sensitive: {@code propernoun-sm} and
     * {@code pronoun-sm} both contain {@code noun-sm}, so they have to be
     * tested before it.
     *
     * @param filein path of the input file
     * @return the matching {@code LexcOptions} part-of-speech constant, or
     *         {@code 0} when no pattern matches
     */
    private static int detectPos(String filein) {
        if (filein.contains("conjunction-sm")) {
            return LexcOptions.CONJUNCTION;
        }
        if (filein.contains("subjunction-sm")) {
            return LexcOptions.SUBJUNCTION;
        }
        if (filein.contains("propernoun-sm")) {
            return LexcOptions.PROPERNOUN;
        }
        // pronoun has to be above noun-sm, otherwise it is assigned as NOUN
        if (filein.contains("pronoun-sm")) {
            return LexcOptions.PRONOUN;
        }
        if (filein.contains("pp-sm")) {
            return LexcOptions.ADPOSITION;
        }
        if (filein.contains("noun-sm")) {
            return LexcOptions.NOUN;
        }
        if (filein.contains("verb-sm")) {
            return LexcOptions.VERB;
        }
        if (filein.contains("adj-sm")) {
            return LexcOptions.ADJECTIVE;
        }
        if (filein.contains("adv-sm")) {
            return LexcOptions.ADVERB;
        }
        if (filein.contains("acro-sm")) {
            return LexcOptions.ACRO;
        }
        if (filein.contains("abbr-sm")) {
            return LexcOptions.ABBR;
        }
        if (filein.contains("numeral-sm")) {
            return LexcOptions.NUMERAL;
        }
        if (filein.contains("num-sm")) {
            return LexcOptions.NUM;
        }
        if (filein.contains("particle-sm")) {
            return LexcOptions.PARTICLE;
        }
        if (filein.contains("interjection-sm")) {
            return LexcOptions.INTERJECTION;
        }
        if (filein.endsWith("/sme-lex.txt")) {
            return LexcOptions.MIDDLE_NOUN;
        }
        return 0;
    }

    /**
     * Prints the cleaned word of one entry, followed by a tab.
     *
     * <p>For entries whose part of speech is {@code LexcOptions.ABBR} the
     * inflection information decides the output: {@code -nodot} prints the
     * word without a trailing dot, {@code -dot} prints it with a trailing
     * dot, and anything else prints both variants (dotted first).
     *
     * @param entry the entry to print
     */
    private void printEntry(Entry entry) {
        String word = cleanWord(entry.getWord());

        if (entry.getPOS() != LexcOptions.ABBR) {
            System.out.println(word + "\t");
            return;
        }

        if (entry.getInfl().contains("-nodot")) {
            System.out.println(word + "\t");
        } else if (entry.getInfl().contains("-dot")) {
            System.out.println(word + ".\t");
        } else {
            System.out.println(word + ".\t");
            System.out.println(word + "\t");
        }
    }

    /**
     * Extracts the leading part of a raw word and strips marker characters.
     *
     * <p>If the word contains a {@code +}, the first non-empty segment
     * delimited by {@code +} is taken; otherwise the first non-empty segment
     * delimited by {@code :}. All occurrences of {@code #}, {@code ^},
     * {@code 0} and {@code %} are then removed from that segment.
     *
     * @param word the raw word; may be {@code null}
     * @return the cleaned word, or an empty string when {@code word} is
     *         {@code null}, empty, or consists only of delimiter characters
     */
    private String cleanWord(String word) {
        if (word == null) {
            return "";
        }
        String delimiter = word.contains("+") ? "+" : ":";
        StringTokenizer st = new StringTokenizer(word, delimiter);
        if (!st.hasMoreTokens()) {
            // nextToken() would throw NoSuchElementException here.
            return "";
        }
        // Literal replacement; no regular expressions are needed.
        return st.nextToken()
                .replace("#", "")
                .replace("^", "")
                .replace("0", "")
                .replace("%", "");
    }
}