package werti.uima.enhancer;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import java.util.StringTokenizer;
import java.io.*;

import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import werti.uima.types.Enhancement;
import werti.uima.types.annot.CGReading;
import werti.uima.types.annot.CGToken;
import werti.util.EnhancerUtils;
import werti.util.StringListIterable;
import werti.server.WERTiServlet;

/**
 * Creates an {@link Enhancement} for every {@link CGToken} whose first CG
 * reading (as produced by {@link werti.ae.Vislcg3Annotator}) contains one of
 * the tags configured in the <code>VPresFutTags</code> parameter together with
 * the part-of-speech tag <code>V</code>.
 *
 * For the exercise types "cloze" and "mc" the lemma is read from the reading;
 * for "mc" the external <code>lookup</code> tool is additionally invoked to
 * generate the person/number forms used as distractors.
 *
 * @author Niels Ott
 * @author Adriane Boyd
 * @author Heli Uibo
 */
public class Vislcg3PresFutIndEnhancer extends JCasAnnotator_ImplBase {

    private static final Logger log =
            Logger.getLogger(Vislcg3PresFutIndEnhancer.class);

    /** Name of the UIMA configuration parameter holding the comma-separated tag list. */
    private static final String TAGS_PARAMETER = "VPresFutTags";

    /** Exercise type values as sent to the servlet (colorize, click, mc or cloze). */
    private static final String EXERCISE_CLOZE = "cloze";
    private static final String EXERCISE_MC = "mc";

    /** Person/number tags for which distractor forms are generated. */
    private static final String[] DISTRACTOR_FORMS =
            {"Sg1", "Sg2", "Sg3", "Pl1", "Pl2", "Pl3"};

    /** Tags to enhance, in the order in which they were configured. */
    private List<String> VPresFutTags;

    // Chunk suffixes; currently not referenced anywhere in this class.
    private static final String CHUNK_BEGIN_SUFFIX = "-B";
    private static final String CHUNK_INSIDE_SUFFIX = "-I";

    // External lookup tool and the transducers it is run with.
    // The flags are kept as separate arguments because the tool is started
    // directly (no shell), so nothing splits a flag string on whitespace.
    private final String lookupLoc = "/usr/local/bin/lookup";
    private final String[] lookupFlags = {"-flags", "mbTT", "-utf8"};
    private final String invertedFST = "/opt/smi/rus/bin/generator-gt-desc.xfst";
    private final String FST = "/opt/smi/rus/bin/analyser-gt-desc.xfst";

    // Paths used for local development instead of the ones above:
    //   lookupLoc   = /Users/mslm/bin/lookup
    //   invertedFST = /Users/mslm/main/langs/rus/src/generator-gt-desc.xfst
    //   FST         = /Users/mslm/main/langs/rus/src/analyser-gt-desc.xfst

    /**
     * A runnable class that reads from a reader (that may
     * be fed by {@link Process}) and puts stuff read into a variable.
     *
     * Deliberately left as a (non-static) inner class so that existing code
     * instantiating it through an enclosing instance keeps compiling.
     *
     * @author nott
     */
    public class ExtCommandConsume2String implements Runnable {

        private final BufferedReader reader;
        private final StringBuilder buffer = new StringBuilder();
        // volatile: isDone()/getBuffer() may be polled from a thread other
        // than the one executing run().
        private volatile boolean finished = false;

        /**
         * @param reader the reader to read from.
         */
        public ExtCommandConsume2String(BufferedReader reader) {
            super();
            this.reader = reader;
        }

        /**
         * Reads from the reader linewise and puts the result to the buffer,
         * terminating every line with a newline character.
         * See also {@link #getBuffer()} and {@link #isDone()}.
         */
        public void run() {
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line).append('\n');
                }
            } catch (IOException e) {
                log.error("Error in reading from external command.", e);
            } finally {
                // Always signal completion, otherwise pollers would wait forever.
                finished = true;
            }
        }

        /**
         * @return true if the reader read by this class has reached its end.
         */
        public boolean isDone() {
            return finished;
        }

        /**
         * @return the string collected by this class or null if the stream has not reached
         * its end yet.
         */
        public String getBuffer() {
            if (!finished) {
                return null;
            }
            return buffer.toString();
        }
    }

    /**
     * Reads the comma-separated <code>VPresFutTags</code> configuration
     * parameter.
     *
     * @param context the UIMA context providing the configuration
     * @throws ResourceInitializationException if the parameter is missing or
     *         is not a string
     */
    @Override
    public void initialize(UimaContext context)
            throws ResourceInitializationException {
        super.initialize(context);
        Object configured = context.getConfigParameterValue(TAGS_PARAMETER);
        if (!(configured instanceof String)) {
            throw new ResourceInitializationException(
                    new IllegalArgumentException("Configuration parameter '"
                            + TAGS_PARAMETER
                            + "' must be a comma-separated string but was: "
                            + configured));
        }
        VPresFutTags = Arrays.asList(((String) configured).split(","));
        // Log only after the assignment so that the actual tags are shown.
        log.info("Verb Pres/Fut tags "+VPresFutTags);
    }

    /**
     * Adds one {@link Enhancement} to the CAS for every token whose first
     * reading matches one of the configured tags.
     *
     * @param cas the CAS holding the {@link CGToken} annotations
     * @throws AnalysisEngineProcessException declared by the UIMA interface
     */
    @Override
    public void process(JCas cas) throws AnalysisEngineProcessException {
        log.info("Starting Verb Pres/Fut enhancement");

        // colorize, click, mc or cloze - chosen by the user and sent to the
        // servlet as a request parameter
        final String enhancement_type = WERTiServlet.enhancement_type;
        // Constant-first equals: the exercise type may not have been set.
        final boolean multipleChoice = EXERCISE_MC.equals(enhancement_type);
        final boolean lemmaNeeded =
                multipleChoice || EXERCISE_CLOZE.equals(enhancement_type);

        // keep track of ids for each annotation class
        Map<String, Integer> classCounts = new HashMap<String, Integer>();
        for (String conT : VPresFutTags) {
            classCounts.put(conT, 0);
            log.info("Tag: "+conT);
        }

        // Iterate over the configured list rather than classCounts.keySet()
        // because the order in which spans are enhanced matters.
        for (String conT : VPresFutTags) {
            // Raw iterator plus cast, as in the original code.
            FSIterator cgTokenIter =
                    cas.getAnnotationIndex(CGToken.type).iterator();

            while (cgTokenIter.hasNext()) {
                CGToken cgt = (CGToken) cgTokenIter.next();

                // The isSafe(cgt) check (exactly one reading) stays disabled:
                // very few words have a single morphological reading.

                // A token without any reading cannot be analysed.
                if (cgt.getReadings() == null || cgt.getReadings().size() == 0) {
                    continue;
                }

                // Only the first reading is analysed.
                CGReading reading = cgt.getReadings(0);
                if (!containsTag(reading, conT, enhancement_type)) {
                    continue;
                }

                String lemma = "";
                String distractors = "";
                if (lemmaNeeded) {
                    lemma = getLemma(reading);
                }
                if (multipleChoice) {
                    // aspect: Impf/Perf, transitivity: IV/TV
                    String aspect = getAspect(reading);
                    String transitivity = getTransitivity(reading);
                    distractors = getDistractors(lemma, aspect, transitivity);
                }

                // make new enhancement
                Enhancement e = new Enhancement(cas);
                e.setRelevant(true);
                e.setBegin(cgt.getBegin());
                e.setEnd(cgt.getEnd());

                // increment id
                int newId = classCounts.get(conT) + 1;

                // NOTE(review): both markup strings are empty, so newId, lemma
                // and distractors never reach the output. Presumably the
                // original markup was lost - confirm against the deployed
                // version before relying on this.
                String spanStartTag = "";
                e.setEnhanceStart(spanStartTag);
                e.setEnhanceEnd("");
                classCounts.put(conT, newId);

                // update CAS
                cas.addFsToIndexes(e);
            }
        }
        log.info("Finished Verb Pres/Fut enhancement");
    }

    /*
     * Determines whether the given token is safe, i.e. unambiguous
     * (it has exactly one reading). Currently not called, see process().
     */
    private boolean isSafe(CGToken t) {
        return t.getReadings() != null && t.getReadings().size() == 1;
    }

    /*
     * Concatenates all tags of the reading, each followed by a single space.
     */
    private String joinTags(CGReading cgr) {
        StringBuilder joined = new StringBuilder();
        for (String rtag : new StringListIterable(cgr)) {
            joined.append(rtag).append(' ');
        }
        return joined.toString();
    }

    /*
     * Determines whether the given reading contains the given tag as a
     * substring and, in addition, the POS tag 'V'.
     * For the exercise types "cloze" and "mc", readings containing "Der/"
     * or "Qst" (derived forms, forms with clitics) are rejected.
     */
    private boolean containsTag(CGReading cgr, String tag, String enhancement_type) {
        String reading_str = joinTags(cgr);

        boolean exercise = EXERCISE_CLOZE.equals(enhancement_type)
                || EXERCISE_MC.equals(enhancement_type);
        if (exercise
                && (reading_str.contains("Der/") || reading_str.contains("Qst"))) {
            log.info("derived form or form with clitics");
            return false;
        }

        if (reading_str.contains(tag) && reading_str.contains(" V ")) {
            log.info(cgr + " contains " + tag);
            return true;
        }
        return false;
    }

    /*
     * Obtains the aspect from the morphological analysis if any (Impf, Perf);
     * returns the empty string if neither is present.
     */
    private String getAspect(CGReading cgr) {
        String reading_str = joinTags(cgr);
        if (reading_str.contains("Impf")) {
            return "Impf";
        }
        if (reading_str.contains("Perf")) {
            return "Perf";
        }
        return "";
    }

    /*
     * Obtains the transitivity from the morphological analysis if any
     * (IV, TV); returns the empty string if neither is present.
     */
    private String getTransitivity(CGReading cgr) {
        String reading_str = joinTags(cgr);
        if (reading_str.contains("IV")) {
            return "IV";
        }
        if (reading_str.contains("TV")) {
            return "TV";
        }
        return "";
    }

    /*
     * Obtains the lemma from the CG reading: the tag that starts with a
     * double quote, with its first and last character removed. If several
     * tags qualify, the last one wins. Returns the empty string if there
     * is none.
     */
    private String getLemma(CGReading cgr) {
        String lemma = "";
        for (String rtag : new StringListIterable(cgr)) {
            // Length check: an empty tag or a lone quote cannot be unwrapped.
            if (rtag != null && rtag.length() >= 2 && rtag.charAt(0) == '"') {
                lemma = rtag.substring(1, rtag.length() - 1);
                log.info(cgr + " lemma: " + lemma);
            }
        }
        // No charset conversion needed: the whole CG input and output is UTF-8.
        return lemma;
    }

    /*
     * Generates distractors for the multiple choice exercise.
     *
     * Builds one generator input line per person/number form and runs it
     * through the inverted FST. Lemmas containing '#' (compounds) are first
     * analysed with the analyser FST and the first analysis, minus "Sg1",
     * is used as the generation prefix.
     *
     * Returns the generated forms, each followed by a space, or the empty
     * string if nothing could be generated (including I/O failure or
     * interruption).
     */
    private String getDistractors(String lemma, String aspect, String transitivity) {
        String tense = "";
        if ("Impf".equals(aspect)) {
            tense = "Prs";
        } else if ("Perf".equals(aspect)) {
            tense = "Fut";
        }

        StringBuilder result = new StringBuilder();
        try {
            String prefix;
            if (lemma.contains("#")) {
                // correct lemma for compound words = morph analysis - V+...
                String analysisOutput = runLookup(
                        "Morph analysis", FST, lemma.replace("#", "") + "\n");
                // The word may be morphologically ambiguous: take the first analysis.
                String analysis = firstAnalysis(analysisOutput);
                if (analysis == null) {
                    log.warn("No morphological analysis for compound lemma: " + lemma);
                    return "";
                }
                prefix = analysis.replace("Sg1", "");
                log.info("lemma of the compound word: "+prefix);
            } else {
                prefix = lemma + "+V+" + aspect + "+" + transitivity + "+" + tense + "+";
            }

            StringBuilder generationInput = new StringBuilder();
            for (String form : DISTRACTOR_FORMS) {
                generationInput.append(prefix).append(form).append('\n');
            }

            String iFSToutput = runLookup(
                    "Form generation", invertedFST, generationInput.toString());
            StringTokenizer tok = new StringTokenizer(iFSToutput);
            while (tok.hasMoreTokens()) {
                String word = tok.nextToken();
                log.info("ifst output:"+word);
                // Forms that could not be generated are excluded, as well as
                // the echoed input strings of the iFST.
                if (!word.contains("+") && !word.contains("-")) {
                    result.append(word).append(' ');
                }
            }
        } catch (IOException e) {
            log.error("Error in running the lookup tool for distractor generation.", e);
        } catch (InterruptedException e) {
            // Restore the flag so that callers further up can react to it.
            Thread.currentThread().interrupt();
            log.error("Interrupted while waiting for the lookup tool.", e);
            return "";
        }
        log.info("Generated forms read from the outputfile: "+result);
        return result.toString();
    }

    /*
     * Extracts the second tab-separated column of the first line of the
     * given lookup output, or returns null if there is no such column.
     */
    private static String firstAnalysis(String lookupOutput) {
        if (lookupOutput == null) {
            return null;
        }
        String[] lines = lookupOutput.split("\n");
        if (lines.length == 0) {
            return null;
        }
        // first column: the analysed word, second column: its analysis
        String[] columns = lines[0].split("\t");
        return columns.length > 1 ? columns[1] : null;
    }

    /*
     * Runs the lookup tool with the given transducer, feeds it the input on
     * its standard input (UTF-8) and returns everything it wrote to its
     * standard output.
     *
     * The tool is started directly with an argument list instead of through
     * "/bin/sh -c", so the input text is never interpreted by a shell.
     */
    private String runLookup(String purpose, String fstPath, String input)
            throws IOException, InterruptedException {
        List<String> command = new ArrayList<String>();
        command.add(lookupLoc);
        command.addAll(Arrays.asList(lookupFlags));
        command.add(fstPath);
        log.info(purpose + " command: " + command);

        Process process = new ProcessBuilder(command).start();
        BufferedReader stdout = null;
        BufferedReader stderr = null;
        try {
            stdout = new BufferedReader(
                    new InputStreamReader(process.getInputStream(), "UTF-8"));
            stderr = new BufferedReader(
                    new InputStreamReader(process.getErrorStream(), "UTF-8"));

            // Drain both streams concurrently so that the tool can never
            // block on a full pipe while we are still writing its input.
            ExtCommandConsume2String stdoutConsumer = new ExtCommandConsume2String(stdout);
            ExtCommandConsume2String stderrConsumer = new ExtCommandConsume2String(stderr);
            Thread stdoutThread = new Thread(stdoutConsumer, "FST STDOUT consumer");
            Thread stderrThread = new Thread(stderrConsumer, "FST STDERR consumer");
            stdoutThread.start();
            stderrThread.start();

            Writer stdin = new OutputStreamWriter(process.getOutputStream(), "UTF-8");
            try {
                stdin.write(input);
            } finally {
                // Closing stdin signals end of input to the tool.
                stdin.close();
            }

            stdoutThread.join();
            stderrThread.join();
            int exitCode = process.waitFor();

            String errors = stderrConsumer.getBuffer();
            if (exitCode != 0) {
                log.warn(purpose + " exited with code " + exitCode + ": " + errors);
            } else if (errors != null && errors.length() > 0) {
                log.debug(purpose + " wrote to stderr: " + errors);
            }
            return stdoutConsumer.getBuffer();
        } finally {
            closeQuietly(stdout);
            closeQuietly(stderr);
            // Releases the process resources; also ends the process if an
            // error or interruption left it running.
            process.destroy();
        }
    }

    /*
     * Closes the given resource, logging (not propagating) any failure.
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            log.debug("Could not close stream of external command.", e);
        }
    }
}