import java.util.*; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.io.*; import no.divvun.Analyzer.Objects.*; import no.divvun.Analyzer.Client.*; import no.divvun.Analyzer.Communication.Response; import no.divvun.Analyzer.Communication.Parameters; public class Lexc2hunspell implements DivvunRequestListener { private List affixes = new ArrayList(); private DivvunBasicServices srv = null; private FileWriter suffixfile = null; private FileWriter dicfile = null; private FileWriter debugfile = null; private Boolean shouldDebug = false; private ReadLexc readLexc; private String DebugFile; private String compoundmiddle = "63010"; private String compoundend = "63001"; private String compoundpermitflag = "64500"; private String compoundforbidflag = "63000"; private String clitics = "64000"; private String hyphen = "65000"; private String compoundsggenga = "63002"; private String compoundplgengp = "63003"; private String compoundbegin = "63004"; private String compoundsgnomleft = "63007"; private String compoundsggenleftna ="63008"; private String compoundplgenleftnp = "63009"; /** * @param args */ public static void main( String[] args) throws IOException { // TODO Auto-generated method stub Lexc2hunspell hunspell = new Lexc2hunspell(); hunspell.generateHunspell(args); } public Lexc2hunspell() throws IOException { DebugFile = ""; } public void generateHunspell(String[] args) throws IOException { int x = 0; int LANG = 0; if (args[0].equals("--debug")) { setDebugFlag(true); setDebugOutputFile(args[1]); x = 2; } for (; x < args.length; x++) { String filein = args[x]; int POS = getPOS(filein); LANG = getLang(filein); readLexc = new ReadLexc (filein, LANG, POS); setupServer(); setupServerParams(LANG); setupOutputFiles(LANG, filein); if (srv != null) { Iterator iterator = readLexc.getEntries().iterator(); while (iterator.hasNext()) { generateHunspellParadigm(iterator.next()); // dicfile.flush(); // suffixfile.flush(); // System.out.print('o'); } srv.disconnect(); } } printAffixes(affixes); postprocessFiles(LANG); } private void setDebugOutputFile(String df) { DebugFile = df; } private String getDebugOutputFile() { return DebugFile; } private void setupServer() { try { // srv = DivvunRequestManager.newDivvunBasicServices(this, "129.242.220.111", 8089); srv = DivvunRequestManager.newDivvunBasicServices(this, "localhost", 8089); } catch (Exception e) { System.err.println(e.getMessage()); } } public void setupServerParams(int LANG) { if (srv != null) { Parameters param = new Parameters(); param.setLang(LexcOptions.getOption(LANG)); param.setFst("/Users/boerre/Dokumenter/gt/" + LexcOptions.getOption(LANG) + "/bin/i" + LexcOptions.getOption(LANG) + "-norm.fst"); param.setParadigm(true); // param.setHyphenate(true); param.setGenerate(true); // param.setUseFilter(true); param.setXml_in(true); param.setXml_out(true); srv.initServer(param); } else { System.err.println("The server is not initialised, is the server_anl.pl program running?"); System.exit(2); } } private void setupOutputFiles(int lang, String filein) throws IOException { int ux = filein.split("/").length; String foil = filein.split("/")[ux - 1]; foil = foil.split("-")[0]; System.err.println("filein er " + filein); if (suffixfile == null) { suffixfile = new FileWriter(LexcOptions.getOption(lang) + ".aff"); suffixfile.write("SET UTF-8\n\nFLAG num\n\n"); suffixfile.write("Allow hyphens in words\n"); suffixfile.write("BREAK 1\n"); suffixfile.write("BREAK -\n\n"); suffixfile.write("WORDCHARS -\n\n"); suffixfile.write("#compound flags\n\n"); suffixfile.write("COMPOUNDBEGIN " + compoundbegin + "\n"); suffixfile.write("COMPOUNDMIDDLE " + compoundmiddle + "\n"); suffixfile.write("COMPOUNDEND " + compoundend + "\n"); suffixfile.write("COMPOUNDPERMITFLAG " + compoundpermitflag + "\n"); suffixfile.write("COMPOUNDFORBIDFLAG " + compoundforbidflag + "\n\n"); suffixfile.write("COMPOUNDRULE 2\n"); suffixfile.write("COMPOUNDRULE " + compoundpermitflag + "*" + compoundsggenga + "," + compoundsggenleftna + "\n"); suffixfile.write("COMPOUNDRULE " + compoundpermitflag + "*" + compoundplgengp + "," + compoundplgenleftnp + "\n\n"); suffixfile.write("PFX " + hyphen + " Y 1\n"); suffixfile.write("PFX " + hyphen + " Y -/" + compoundpermitflag + "\n\n"); suffixfile.write("SFX " + clitics + " Y 14\n"); suffixfile.write("SFX " + clitics + " 0 najigo . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 goson . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 go . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 ge . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 gen . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 ges . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 gis . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 nai . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 ba . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 be . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 hal . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 han . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 bat . +Clt\n"); suffixfile.write("SFX " + clitics + " 0 son . +Clt\n\n"); } if (dicfile == null) dicfile = new FileWriter(LexcOptions.getOption(lang) + ".dic.tmp"); if (getDebugFlag() == true) debugfile = new FileWriter(getDebugOutputFile()); } public void generateHunspellParadigm(Entry entry) throws IOException { String leftStem = null; String ordinal = null; if (entry.getWord().contains("+A+Ord")) ordinal = "A"; leftStem = cleanWord(entry.getWord()); entry.setWord(leftStem); Paradigm paradigm = getParadigm(entry, ordinal); if (paradigm == null) System.err.println (" And we weren't able to get a paradigm for the word: " + leftStem); else { if (getDebugFlag() == true) debugfile.write(printParadigm(paradigm)); entry.setParadigm(paradigm); Map> dicmap = new HashMap>(); for (int i = 0; i < paradigm.size(); i++) { Reading reading = paradigm.get(i).getReading(); // String analysis = reading.getValue(); String word = cleanWord(reading.getNext().getValue()); String parts[] = word.split(">"); if (parts.length > 2 ) System.out.println("NB!: " + word); List suffixlist = dicmap.get(parts[0]); if (suffixlist == null) { dicmap.put(parts[0], suffixlist = new ArrayList()); } if (parts.length > 1) { String t = parts[1].replace("#", ""); if (!affixes.contains(t)) { affixes.add(t); //// System.out.println("affixes: " + affixes.toString()); } if (!suffixlist.contains(affixes.indexOf(t))) { suffixlist.add(affixes.indexOf(t)); //// System.out.println("suffixlist " + suffixlist.toString()); } } } entry.setParadigm(null); printStems(dicmap); } } private String cleanWord(String word) { StringTokenizer st = new StringTokenizer(word); String clean; if (word.contains("+") && !word.contains("+ShCmp")) { clean = st.nextToken("+"); } else { clean = st.nextToken(":"); } // System.out.println("clean before replace: " + clean); // System.out.println("clean bf: " + clean); Pattern pattern = Pattern.compile("[#^0%]"); Matcher matcher = pattern.matcher(clean); clean = matcher.replaceAll(""); // System.out.println("clean af: " + clean); return clean; } private Paradigm getParadigm(Entry entry, String ordinal) { Paradigm paradigm; String word = entry.getWord(); // System.err.println("The word inside getParadigm: " + word); if (ordinal == null) { // System.out.println("null ordinal " + entry.getsPOS()); paradigm = srv.getFullParadigm(word, entry.getsPOS()); if (entry.getsPOS().equals("Po") && paradigm == null) paradigm = srv.getFullParadigm(word, "Pr"); } else { // System.out.println("ordinal 1"); paradigm = srv.getFullParadigm(word, ordinal); } return paradigm; } public void requestCompleted(Response response) { // TODO Auto-generated method stub } public void requestStatusChanged(DivvunRequestStatus divvunrequeststatus) { // TODO Auto-generated method stub } public void printAffixes(List suffixes) { StringBuilder buffer = new StringBuilder(); for (int i = 0; i < suffixes.size(); i++) { buffer.append("SFX " + i + " Y 1\n"); buffer.append("SFX " + i + " " + 0 + " " + suffixes.get(i) + "\n\n"); } try { suffixfile.write(buffer.toString()); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public void printStems(Map> dicmap) { StringBuilder buffer = new StringBuilder(); for (Map.Entry> e : dicmap.entrySet()) { String word = e.getKey(); List suffixclasses = e.getValue(); buffer.append(word); if (suffixclasses.size() > 0) { buffer.append("/"); for (Iterator itr = suffixclasses.iterator(); itr.hasNext(); ) buffer.append(itr.next() + ","); } buffer.deleteCharAt(buffer.length() - 1); // delete the last , it is not needed buffer.append("\n"); } try { dicfile.write(buffer.toString()); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } } private Boolean getDebugFlag() { return shouldDebug; } private void setDebugFlag(Boolean flag) { shouldDebug = flag; } private int getPOS(String filein){ int POS = 0; if (filein.contains("conjunction-sm")) POS = LexcOptions.CONJUNCTION; else if (filein.contains("subjunction-sm")) POS = LexcOptions.SUBJUNCTION; else if (filein.contains("propernoun-sm")) POS = LexcOptions.PROPERNOUN; else // pronoun has to be above noun-sme, otherwise it is assigned as NOUN if (filein.contains("pronoun-sm")) POS = LexcOptions.PRONOUN; else if (filein.contains("pp-sm")) POS = LexcOptions.ADPOSITION; else if (filein.contains("noun-sm")) POS = LexcOptions.NOUN; else if (filein.contains("verb-sm")) POS = LexcOptions.VERB; else if (filein.contains("adj-sm")) POS = LexcOptions.ADJECTIVE; else if (filein.contains("adv-sm")) POS = LexcOptions.ADVERB; else if (filein.contains("acro-sm")) POS = LexcOptions.ACRO; else if (filein.contains("abbr-sm")) POS = LexcOptions.ABBR; else if (filein.contains("numeral-sm")) POS = LexcOptions.NUMERAL; else if (filein.contains("num-sm")) POS = LexcOptions.NUM; else if (filein.contains("particle-sm")) POS = LexcOptions.PARTICLE; else if (filein.contains("interjection-sm")) POS = LexcOptions.INTERJECTION; return POS; } private String printParadigm(Paradigm paradigm) { StringBuilder buffer = new StringBuilder(); for(int i = 0; i < paradigm.size(); i++) { String word = paradigm.get(i).getReading().getNext().getValue(); // Pattern pattern = Pattern.compile("[XYQW][0-9]"); // Matcher matcher = pattern.matcher(word); // word = matcher.replaceAll(""); buffer.append(word + "\n"); } return buffer.toString(); } private String getCompoundInfo(String analysis, String compound) { // System.err.println(analysis + "\t" + compound); String flag = ""; if (compound.contains("+None")) flag = compoundforbidflag; else if (compound.contains("+Last")) flag = compoundpermitflag + "," + compoundend; //COMPOUNDEND else if (analysis.endsWith("Sg+Gen")) flag = compoundpermitflag + "," + compoundsggenga; //COMPOUNDSGGEN Ga else if (analysis.endsWith("Pl+Gen")) flag = compoundpermitflag + "," + compoundplgengp; //COMPOUNDPLGEN Gp else if (analysis.endsWith("Sg+Nom")) { if (compound.contains("+SgNomCmp") || compound.contains("+SgNomCmp")) flag = compoundpermitflag + "," + compoundbegin + "," + compoundmiddle; //COMPOUNDBEGIN,COMPOUNDMIDDLE else flag = compoundpermitflag + "," + compoundend; } else if (analysis.endsWith("+SgGenCmp")) { if (compound.contains("+SgGenCmp")) flag = compoundpermitflag + "," + compoundbegin + "," + compoundsggenga; else flag = compoundpermitflag + "," + compoundsggenga; //COMPUNDSGGENCMP Ga } else if (analysis.endsWith("+PlGenCmp")) { if (compound.contains("+PlGenCmp")) flag = compoundpermitflag + "," + compoundbegin + "," + compoundplgengp; else flag = compoundpermitflag + "," + compoundplgengp; //COMPUNDPLGENCMP Gp } if (compound.contains("+SgLeft") || compound.contains("+SgNomLeft")) { if (!flag.equals("")) flag += ","; flag += compoundpermitflag + "," + compoundsgnomleft; //COMPOUNDSGNOMLEFT } if (compound.contains("+SgGenLeft")) { if (!flag.equals("")) flag += ","; flag += compoundpermitflag + "," + compoundsggenleftna; //COMPOUNDSGGENLEFT Na } if (compound.contains("+PlGenLeft")) { if (!flag.equals("")) flag += ","; flag += compoundpermitflag + "," + compoundplgenleftnp; //COMPOUNDPLGENLEFT Np } //else flag = "0"; return flag; } private int getLang(String filein) { if (filein.contains("sme-lex")) return LexcOptions.SME; else if (filein.contains("smj-lex")) return LexcOptions.SMJ; else if (filein.contains("sma-lex")) return LexcOptions.SMA; else { System.err.println("Wrong type of file: Only sm[aej]-lex files accepted"); return -1; } } private void postprocessFiles(int lang) throws IOException { if (suffixfile != null) suffixfile.close(); if (dicfile != null) { dicfile.flush(); dicfile.close(); // FileWriter outdic = new FileWriter(LexcOptions.getOption(lang) + ".dic"); // outdic.write(getStemCount().toString() + "\n"); // FileReader indic = new FileReader(LexcOptions.getOption(lang) + ".dic.tmp"); // // char[] buf = new char[1024]; // int len; // while ((len = indic.read(buf)) > 0) { // outdic.write(buf, 0, len); // } // indic.close(); // outdic.flush(); // outdic.close(); } if (debugfile != null) debugfile.close(); } }