import scala.io.Codec
import java.io.{DataInputStream, FileInputStream}
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

import net.sf.hfst.Transducer
import net.sf.hfst.TransducerAlphabet
import net.sf.hfst.TransducerHeader
import net.sf.hfst.UnweightedTransducer
import net.sf.hfst.WeightedTransducer
import net.sf.hfst.NoTokenizationException

/**
 * Thin Scala wrapper around the hfst-optimized-lookup Java runtime.
 *
 * Loads the analyser-gt-norm.hfstol transducer for the given language code
 * from ../../langs/<lang>/src, unless an explicit path to an .hfstol file
 * is given, and returns the analyses of a word form.
 */
class Analyzer(val lang: String, val path: String = "") {

  // Lazily initialised; stays null until the transducer has been loaded.
  private var transducer: Transducer = null

  // Loads the transducer on first use and caches it.
  // Returns null if the file cannot be found or read.
  private def getTransducer(): Transducer = {
    if (transducer != null) {
      return transducer
    }
    val file: String =
      if (path == "") "../../langs/" + lang + "/src/analyser-gt-norm.hfstol"
      else path
    try {
      transducer = loadTransducer(new FileInputStream(file))
      return transducer
    } catch {
      case e: Exception => println("Ex: " + e)
    }
    null
  }

  @throws(classOf[java.io.FileNotFoundException])
  @throws(classOf[java.io.IOException])
  private def loadTransducer(transducerfile: FileInputStream): Transducer = {
    // Read the header, then the alphabet, then construct a weighted or
    // unweighted transducer depending on the header flags.
    val charstream = new DataInputStream(transducerfile)
    val h = new TransducerHeader(transducerfile)
    val a = new TransducerAlphabet(charstream, h.getSymbolCount())
    if (h.isWeighted()) {
      new WeightedTransducer(transducerfile, h, a)
    } else {
      new UnweightedTransducer(transducerfile, h, a)
    }
  }

  // Returns all analyses of the given word form, or an empty list if the
  // input cannot be tokenized with the transducer's alphabet.
  def analyse(strOrig: String): List[String] = {
    val analys = new ArrayBuffer[String]()
    val transducer: Transducer = getTransducer()
    try {
      val res: Iterable[String] = transducer.analyze(strOrig).asScala
      res.foreach { a => analys.append(a) }
    } catch {
      case _: NoTokenizationException =>
        println("NoTokenizationException: " + strOrig)
    }
    analys.toList
  }
}

// Command-line demo: analyse the first argument with the "sme" analyser.
object Demo extends App {
  implicit val codec: Codec = Codec.UTF8
  val analyzer = new Analyzer("sme")
  val res = analyzer.analyse(args(0))
  println(res)
}