package werti.uima.ae;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import opennlp.tools.chunker.ChunkerME;

import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;

import werti.WERTiContext;
import werti.WERTiContext.WERTiContextException;
import werti.uima.types.annot.SentenceAnnotation;
import werti.uima.types.annot.Token;

/**
 * Wrapper for OpenNlp chunker.
 *
 * Depends on annotation from {@link OpenNlpTokenizer} and {@link OpenNlpTagger}:
 * for every {@link SentenceAnnotation} the covered {@link Token}s (text and tag)
 * are handed to the chunker, and the resulting chunk label is stored back on
 * each token via {@link Token#setChunk}.
 *
 * @author Adriane Boyd
 *
 */
public class OpenNlpChunker extends JCasAnnotator_ImplBase {

	/** Chunker models keyed by document language code ("en", "es"). */
	private Map<String, ChunkerME> chunkers;

	private static final Logger log =
		Logger.getLogger(OpenNlpChunker.class);

	/**
	 * Requests the English and Spanish chunkers from the {@link WERTiContext}.
	 *
	 * @param aContext the UIMA context passed on to the superclass
	 * @throws ResourceInitializationException if a chunker cannot be obtained
	 *         from the {@link WERTiContext}
	 * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext)
	 */
	@Override
	public void initialize(UimaContext aContext)
			throws ResourceInitializationException {
		super.initialize(aContext);
		try {
			chunkers = new HashMap<String, ChunkerME>();
			chunkers.put("en", WERTiContext.request(ChunkerME.class, "en"));
			chunkers.put("es", WERTiContext.request(ChunkerME.class, "es"));
		} catch (WERTiContextException wce) {
			throw new ResourceInitializationException(wce);
		}
	}

	/**
	 * Assigns a chunk label to every token of every sentence in the CAS.
	 *
	 * @param jcas the CAS holding sentence and token annotations
	 * @throws AnalysisEngineProcessException if no chunker is registered for
	 *         the document language
	 */
	// The annotation indexes are used as raw types and narrowed to typed
	// iterators; the index is requested by type id, so the elements are of
	// the requested annotation type.
	@SuppressWarnings({ "unchecked", "rawtypes" })
	@Override
	public void process(JCas jcas) throws AnalysisEngineProcessException {
		log.debug("Starting chunk annotation");

		final AnnotationIndex sentIndex = jcas.getAnnotationIndex(SentenceAnnotation.type);
		final AnnotationIndex tokenIndex = jcas.getAnnotationIndex(Token.type);

		final String lang = jcas.getDocumentLanguage();
		// A single lookup also catches a language that is mapped to null.
		final ChunkerME chunker = chunkers.get(lang);
		if (chunker == null) {
			final String message = "No chunker for language: " + lang;
			log.error(message);
			// Carry the reason along so callers do not get an empty exception.
			throw new AnalysisEngineProcessException(
					new IllegalArgumentException(message));
		}

		final Iterator<SentenceAnnotation> sit = sentIndex.iterator();
		while (sit.hasNext()) {
			final List<Token> tokenlist = new ArrayList<Token>();
			final List<String> tokens = new ArrayList<String>();
			final List<String> tags = new ArrayList<String>();

			// Collect the tokens covered by the current sentence.
			final Iterator<Token> tit = tokenIndex.subiterator(sit.next());
			while (tit.hasNext()) {
				final Token t = tit.next();
				tokenlist.add(t);
				tokens.add(t.getCoveredText());
				tags.add(t.getTag());
			}

			// Nothing to label for a sentence without tokens.
			if (tokenlist.isEmpty()) {
				continue;
			}

			// Program against List: the concrete list class returned by the
			// chunker is an implementation detail and must not be cast to.
			final List<String> chunks = chunker.chunk(tokens, tags);

			// One label per token is expected; bound by the shorter list so a
			// mismatch can never index past the end of either one.
			final int count = Math.min(chunks.size(), tokenlist.size());
			for (int i = 0; i < count; i++) {
				tokenlist.get(i).setChunk(chunks.get(i));
			}
		}

		log.debug("Finished chunk annotation");
	}
}