package werti.uima.enhancer; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Stack; import java.util.StringTokenizer; import java.io.*; import org.apache.log4j.Logger; import org.apache.uima.UimaContext; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.FSIterator; import org.apache.uima.jcas.JCas; import org.apache.uima.resource.ResourceInitializationException; import werti.uima.types.Enhancement; import werti.uima.types.annot.CGReading; import werti.uima.types.annot.CGToken; import werti.util.EnhancerUtils; import werti.util.StringListIterable; import werti.server.WERTiServlet; /** * Use the TAG-B TAG-I sequences resulting from the CG3 analysis with * {@link werti.ae.Vislcg3Annotator} to enhance spans corresponding * to the tags specified by the activity as tags of negation forms of verbs. * * @author Niels Ott * @author Adriane Boyd * @author Heli Uibo * */ public class Vislcg3NounsGenEnhancer extends JCasAnnotator_ImplBase { private static final Logger log = Logger.getLogger(Vislcg3NounsGenEnhancer.class); private String enhancement_type = WERTiServlet.enhancement_type; // colorize, click, mc or cloze - chosen by the user and sent to the servlet as a request parameter private List NGenTags; private static String CHUNK_BEGIN_SUFFIX = "-B"; private static String CHUNK_INSIDE_SUFFIX = "-I"; private final String lookupLoc = "/usr/local/bin/lookup"; private final String lookupFlags = "-flags mbTT -utf8"; private final String invertedFST = " /opt/smi/rus/bin/generator-gt-desc.xfst"; private final String FST = " /opt/smi/rus/bin/analyser-gt-desc.xfst"; // local paths: /*private final String lookupLoc = "/Users/mslm/bin/lookup"; private final String lookupFlags = "-flags mbTT -utf8"; private final String invertedFST = " /Users/mslm/main/langs/rus/src/generator-gt-desc.xfst"; private final String FST = " /Users/mslm/main/langs/rus/src/analyser-gt-desc.xfst"; */ /** * A runnable class that reads from a reader (that may * be fed by {@link Process}) and puts stuff read into a variable. * @author nott */ public class ExtCommandConsume2String implements Runnable { private BufferedReader reader; private boolean finished; private String buffer; /** * @param reader the reader to read from. */ public ExtCommandConsume2String(BufferedReader reader) { super(); this.reader = reader; finished = false; buffer = ""; } /** * Reads from the reader linewise and puts the result to the buffer. * See also {@link #getBuffer()} and {@link #isDone()}. */ public void run() { String line = null; try { while ( (line = reader.readLine()) != null ) { buffer += line + "\n"; } } catch (IOException e) { log.error("Error in reading from external command.", e); } finished = true; } /** * @return true if the reader read by this class has reached its end. */ public boolean isDone() { return finished; } /** * @return the string collected by this class or null if the stream has not reached * its end yet. */ public String getBuffer() { if ( ! finished ) { return null; } return buffer; } } @Override public void initialize(UimaContext context) throws ResourceInitializationException { log.info("Gen Noun tags "+NGenTags); super.initialize(context); NGenTags = Arrays.asList(((String)context.getConfigParameterValue("NGenTags")).split(",")); } @Override public void process(JCas cas) throws AnalysisEngineProcessException { log.info("Starting Gen Noun enhancement"); String enhancement_type = WERTiServlet.enhancement_type; // colorize, click, mc or cloze - chosen by the user and sent to the servlet as a request parameter // stack for started enhancements (chunk) // Stack enhancements = new Stack(); // keep track of ids for each annotation class HashMap classCounts = new HashMap(); for (String conT : NGenTags) { classCounts.put(conT, 0); log.info("Tag: "+conT); } // iterating over chunkTags instead of classCounts.keySet() because it is important to control the order in which // spans are enhanced for (String conT: NGenTags) { FSIterator cgTokenIter = cas.getAnnotationIndex(CGToken.type).iterator(); // remember previous token so we can getEnd() from it (chunk) // CGToken prev = null; int newId = 0; // go through tokens while (cgTokenIter.hasNext()) { CGToken cgt = (CGToken) cgTokenIter.next(); // more than one reading? don't mark up! /*if (!isSafe(cgt)) { continue; }*/ // Temporarily commented out because there are very few words that have one morphological reading. // analyze reading CGReading reading = cgt.getReadings(0); String lemma = "", gender = "", animacy = "", distractors = ""; if (containsTag(reading, conT, enhancement_type)) { if (enhancement_type.equals("cloze") || enhancement_type.equals("mc")) { // get lemma from the CG reading lemma = getLemma(reading); } if (enhancement_type.equals("mc")) { boolean prop = false; // Proper nouns have the tag "Prop" in the morphological information. This is needed when generating distractors. if (containsTag(reading, "Prop", enhancement_type)) { prop = true; } // get gender from the CG reading: Fem, Msc, Neu gender = getGender(reading); // get animacy from the CG reading: Anim, Inan animacy = getAnimacy(reading); // generate the distractors, based on the lemma, stemtype and if it is a proper noun or not distractors = getDistractors(lemma, gender, animacy, prop); } // make new enhancement Enhancement e = new Enhancement(cas); e.setRelevant(true); e.setBegin(cgt.getBegin()); e.setEnd(cgt.getEnd()); // increment id newId = classCounts.get(conT) + 1; String spanStartTag = ""; //log.info(spanStartTag); e.setEnhanceStart(spanStartTag); e.setEnhanceEnd(""); classCounts.put(conT, newId); //log.info(newId); // push onto stack //enhancements.push(e); // update CAS cas.addFsToIndexes(e); //e.addToIndexes(); //log.info("Started conjunction " + conT + "-" + newId + " at pos " + e.getBegin()); } //prev = cgt; } } log.info("Finished Gen Noun enhancement"); } /* * Determines whether the given token is safe, i.e. unambiguous */ private boolean isSafe(CGToken t) { return t.getReadings() != null && t.getReadings().size() == 1; } /* * Determines whether the given reading contains the given tag */ private boolean containsTag(CGReading cgr, String tag, String enhancement_type) { StringListIterable reading = new StringListIterable(cgr); String reading_str = ""; for (String rtag : reading) { reading_str = reading_str + rtag + " "; } //log.info("enhancement type is:"+enhancement_type); // If the exercise type is "practice" (cloze) then the derived forms, forms with clitics and proper nouns are excluded from the selection. if ((reading_str.contains("Der/") || reading_str.contains("Qst")) && (enhancement_type.equals("cloze") || enhancement_type.equals("mc"))) { log.info("derived form or form with clitics"); return false; } if (reading_str.contains(tag) && reading_str.contains(" N ")) { // Tag string contains the given tag sequence as a substring, plus the POS tag 'N'. log.info(cgr + " contains " + tag); return true; } //log.info(cgr + " does not contain " + tag); return false; } /* * Obtains the gender from the morphological analysis if any (Fem, Msc, Neu) */ private String getGender(CGReading cgr) { String gender = ""; StringListIterable reading = new StringListIterable(cgr); String reading_str = ""; for (String rtag : reading) { reading_str = reading_str + rtag + " "; } if (reading_str.contains("Fem")) { gender = "Fem"; } else if (reading_str.contains("Msc")) { gender = "Msc"; } else if (reading_str.contains("Neu")) { gender = "Neu"; } return gender; } /* * Obtains the animacy from the morphological analysis if any (Anim, Inan) */ private String getAnimacy(CGReading cgr) { String animacy = ""; StringListIterable reading = new StringListIterable(cgr); String reading_str = ""; for (String rtag : reading) { reading_str = reading_str + rtag + " "; } if (reading_str.contains("Anim")) { animacy = "Anim"; } else if (reading_str.contains("Inan")) { animacy = "Inan"; } return animacy; } /* * Obtains the lemma from the CG reading. */ private String getLemma(CGReading cgr) { StringListIterable reading = new StringListIterable(cgr); String lemma = "", lemma_utf8 = ""; for (String rtag : reading) { if (rtag.charAt(0) == '\"') { lemma = rtag.substring(1,rtag.length()-1); log.info(cgr + " lemma: " + lemma); } } // Convert the lemma to utf8. - Not needed any more because the whole cg input and output is converted to utf8. /* try { byte[] b = lemma.getBytes(); lemma_utf8 = new String(b,"UTF-8"); } catch (UnsupportedEncodingException e) { System.out.println(e); }*/ //log.info(cgr + " does not contain " + tag); //log.info("lemma encoded in UTF8: " + lemma_utf8); return lemma; } /* * Generates distractors for the multiple choice exercise. */ private String getDistractors(String lemma, String gender, String animacy, boolean propernoun) { String[] distract_forms = {"Sg+Nom", "Sg+Acc", "Sg+Gen", "Sg+Loc", "Sg+Dat", "Sg+Ins"}; String str, word, result = "", generationInput = "", propN = ""; if (propernoun) { propN = "+Prop"; } try { if (lemma.contains("#")) { // correct lemma for compound words = morf analysis - N+Sg+Nom lemma = lemma.replace("#",""); String[] analysisPipeline = {"/bin/sh", "-c", "/bin/echo \"" + lemma + "\" | " + lookupLoc + " " + lookupFlags + " " + FST}; log.info("Morph analysis pipeline: "+analysisPipeline[2]); Process process = Runtime.getRuntime().exec(analysisPipeline); BufferedReader fromFST = new BufferedReader(new InputStreamReader(process.getInputStream(), "UTF8")); ExtCommandConsume2String stdoutConsumer = new ExtCommandConsume2String(fromFST); Thread stdoutConsumerThread = new Thread(stdoutConsumer, "FST STDOUT consumer"); stdoutConsumerThread.start(); try { stdoutConsumerThread.join(); } catch (InterruptedException e) { log.error("Error in joining output consumer of FST with regular thread, going mad.", e); return null; } fromFST.close(); String morfanal = stdoutConsumer.getBuffer(); String[] analysis = morfanal.split("\n"); // the word may be morhologically ambiguous String[] token = analysis[0].split("\t"); // take the first analysis lemma = token[1]; // the first token is word to be analysed and the second token is the morph analysis lemma = lemma.replace("Sg+Nom",""); log.info("lemma of the compound word: "+lemma); for (int j=0; j < distract_forms.length; j++) { generationInput += lemma + distract_forms[j] + "\n"; } } else { for (int j=0; j < distract_forms.length; j++) { generationInput += lemma + "+N+" + gender + "+" + animacy + "+" + distract_forms[j] + "\n"; } } String[] generationPipeline = {"/bin/sh", "-c", "/bin/echo \"" + generationInput + "\" | " + lookupLoc + " " + lookupFlags + " " + invertedFST}; log.info("Form generation pipeline: "+generationPipeline[2]); Process process2 = Runtime.getRuntime().exec(generationPipeline); BufferedReader fromIFST = new BufferedReader(new InputStreamReader(process2.getInputStream(), "UTF8")); ExtCommandConsume2String stdoutConsumer2 = new ExtCommandConsume2String(fromIFST); Thread stdoutConsumerThread2 = new Thread(stdoutConsumer2, "FST STDOUT consumer"); stdoutConsumerThread2.start(); try { stdoutConsumerThread2.join(); } catch (InterruptedException e) { log.error("Error in joining output consumer of VislCG with regular thread, going mad.", e); return null; } fromIFST.close(); String iFSToutput = stdoutConsumer2.getBuffer(); StringTokenizer tok = new StringTokenizer(iFSToutput); while (tok.hasMoreTokens()) { word = tok.nextToken(); log.info("ifst output:"+word); if (!word.contains("+") && !word.contains("-")) { // forms that could not be generated are excluded, as well as input strings of the iFST result = result + word + " "; } } } catch (IOException e) { System.out.println(e.getMessage()); } log.info("Generated forms read from the outputfile: "+result); return result; } }