import scala.io.Codec
import scala.xml.{Comment, Elem, NodeSeq, PrettyPrinter, XML}
import collection.mutable.{HashMap, MultiMap, Set}
import java.io.FileOutputStream
import java.nio.channels.Channels
import TermwikiParser._

/**
 * Walks the terms produced by `TermwikiParser.iterator` and writes two kinds of output:
 *
 *  - `bidix/<src>-<trg>.dix`: candidate dictionary entries for every language pair in
 *    `bidix`, restricted to source expressions that do not already occur as the text of
 *    an `<l>` element in the loaded `apertium-sme-nob.sme-nob.dix`.
 *  - `omegat/<src>-<trg>.txt`: one tab-separated line (source, target, then category,
 *    explanation and definition) for every language pair in `pairs`.
 *
 * NOTE(review): the copy of this file that was reviewed had lost its XML markup (the
 * literals in `toBidixXml` / `posToBidix` and the wrapper strings written around the
 * entries were empty). They are reconstructed below following the Apertium bilingual
 * dictionary layout; the element names `l`, `s` and `sdef` are the ones this code itself
 * queries or produces, everything else should be confirmed against the repository.
 */
object Extract extends App {
  implicit val codec: Codec = Codec.UTF8

  val file = "../../words/terms/termwiki/dump.xml"
  val bidixXml = XML.loadFile("apertium-sme-nob.sme-nob.dix")
  val iterator = TermwikiParser.iterator(file)

  // Termwiki language code -> analyser for that language (code and analyser name differ).
  val analysers: HashMap[String, Analyzer] = HashMap(
    List("se" -> "sme", "nb" -> "nob", "fi" -> "fin", "sma" -> "sma", "smj" -> "smj", "smn" -> "smn")
      .map { case (wikiCode, lang) =>
        wikiCode -> new Analyzer(lang, s"../../langs/$lang/tools/mt/apertium/analyser-mt-gt-desc.hfstol")
      }: _*)

  val bidix = List("se" -> "nb", "se" -> "sma", "se" -> "smj", "se" -> "smn", "se" -> "fi")
  val pairs = List("se" -> "smj", "se" -> "smn", "se" -> "nb", "se" -> "sma", "nb" -> "se",
    "nb" -> "sma", "nb" -> "smj", "fi" -> "smn", "fi" -> "sms", "fi" -> "se", "se" -> "fi")

  val files = new HashMap[String, Set[String]] with MultiMap[String, String]
  val bidixFiles = new HashMap[String, Set[NodeSeq]] with MultiMap[String, NodeSeq]

  // Texts of all <l> elements of the existing dictionary; used for membership tests only.
  val bidixTerms: Set[String] = Set((bidixXml \\ "l").map(_.text): _*)

  // The regexes are defined before the main loop on purpose: `App` initialises vals in
  // body order, so a val placed after the loop would still be null while the loop runs.

  // Keeps analyses tagged +Sg/+Pl followed by +Nom/+Indef, or +V(+TV|+IV)+Inf.
  // The alternatives are groups; the former `[Sg|Pl]` / `[Nom|Indef]` were character
  // classes and therefore matched any tag that merely started with one of those letters.
  private val citationForm = ".*\\+(Sg|Pl).*\\+(Nom|Indef).*|.*\\+V(\\+TV|\\+IV)\\+Inf.*".r
  // Splits an analysis into the part before the first '+' and the tag string after it.
  private val reStem = """([^\+]+)(\+.*)""".r
  // Matches one '+'-delimited tag at a time.
  private val reAnal = """([^\+]+)""".r

  try {
    iterator.foreach { term =>
      bidix.foreach { case (srcLang, trgLang) => collectBidixEntries(term, srcLang, trgLang) }
      pairs.foreach { case (srcLang, trgLang) => collectGlossaryLine(term, srcLang, trgLang) }
    }
  } finally {
    // Close the dump even when processing a term fails.
    iterator.close()
  }

  bidixFiles foreach { case (pairName, entries) =>
    val sdefs: Set[NodeSeq] = entries flatMap { _ \\ "s" }
    withUtf8Writer("bidix/" + pairName + ".dix") { out =>
      // NOTE(review): wrapper markup reconstructed (see object comment) - confirm.
      out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
      out.write("<dictionary>\n")
      out.write("<sdefs>\n")
      // Every distinct <s .../> used in the entries is declared as <sdef .../>.
      sdefs foreach { s => out.write(rename("sdef", s).toString + "\n") }
      out.write("</sdefs>\n")
      out.write("<section id=\"main\" type=\"standard\">\n")
      entries.toSeq.sortBy(x => (x \\ "l").text) foreach { x => out.write(x.toString + "\n") }
      out.write("</section>\n")
      out.write("</dictionary>\n")
    }
  }

  files foreach { case (pairName, lines) =>
    withUtf8Writer("omegat/" + pairName + ".txt") { out =>
      lines foreach { line => out.write(line); out.write("\n") }
    }
  }

  /**
   * Opens `path` for writing as UTF-8, runs `body` on the writer and always closes it.
   * The parent directory is created first if it does not exist.
   */
  private def withUtf8Writer(path: String)(body: java.io.Writer => Unit): Unit = {
    val target = new java.io.File(path)
    Option(target.getParentFile).foreach(_.mkdirs())
    val writer = Channels.newWriter(new FileOutputStream(target).getChannel(), "UTF-8")
    try body(writer) finally writer.close()
  }

  /**
   * Analyses `word` with the analyser registered for `lang` and keeps the analyses that
   * start with `word` itself and match `citationForm`, sorted and without duplicates.
   */
  private def citationAnalyses(lang: String, word: String): List[String] =
    analysers(lang).analyse(word)
      .filter { a =>
        // headOption: splitting a string made only of '+' yields an empty array.
        a.split("\\+").headOption.contains(word) && citationForm.pattern.matcher(a).matches()
      }
      .sorted.distinct

  /**
   * Adds to `bidixFiles` one entry per combination of source and target analysis of the
   * term's expressions in `srcLang` / `trgLang`. Nothing is added unless `srcLang` is
   * "se", both expressions are present and the source is not already in `bidixTerms`.
   */
  private def collectBidixEntries(term: Term, srcLang: String, trgLang: String): Unit = {
    var src = ""
    var trg = ""
    // When several entries share a language the last one wins, as before.
    term.entries foreach { e =>
      e.language match {
        case `srcLang` => src = e.expression
        case `trgLang` => trg = e.expression
        case _ =>
      }
    }
    if (srcLang == "se" && src != "" && trg != "" && !bidixTerms.contains(src)) {
      val srcAnalyses = citationAnalyses(srcLang, src)
      // Without a usable source analysis no entry can be produced, so skip the target.
      val trgAnalyses = if (srcAnalyses.isEmpty) Nil else citationAnalyses(trgLang, trg)
      for (srcAna <- srcAnalyses; trgAna <- trgAnalyses) {
        val entry = toBidixXml(term, srcAna, trgAna)
        // An empty result means the analyses could not be parsed; do not emit a blank line.
        if (entry.nonEmpty) bidixFiles.addBinding(srcLang + "-" + trgLang, entry)
      }
    }
  }

  /**
   * Adds a tab-separated line for the term to `files` under "<srcLang>-<trgLang>" when
   * the term has an expression in both languages. Definition and explanation are taken
   * from the source-language entry; null values become empty strings.
   */
  private def collectGlossaryLine(term: Term, srcLang: String, trgLang: String): Unit = {
    var src = ""
    var trg = ""
    var defn = ""
    var expl = ""
    term.entries foreach { e =>
      e.language match {
        case `srcLang` =>
          src = e.expression
          defn = Option(e.definition).getOrElse("")
          expl = Option(e.explanation).getOrElse("")
        case `trgLang` => trg = e.expression
        case _ =>
      }
    }
    if (src != "" && trg != "")
      files.addBinding(srcLang + "-" + trgLang, s"$src\t$trg\t${term.category} $expl $defn")
  }

  /** Returns `n` with its label replaced by `label` if it is an element, else `n` unchanged. */
  private def rename(label: String, n: NodeSeq): NodeSeq = n match {
    case Elem(prefix, _, attributes, scope, child @ _*) =>
      Elem(prefix, label, attributes, scope, true, child: _*)
    case _ => n
  }

  /**
   * Builds one dictionary entry from a source and a target analysis string.
   *
   * Only the text before the first whitespace of each analysis is used. It is split into
   * the stem (before the first '+') and its tags; each tag is mapped through `posToBidix`.
   * A "CHECK THIS" comment is appended when the first tag differs between the two sides.
   *
   * @param term the term the analyses belong to (not used in the entry itself)
   * @param src  analysis of the source expression
   * @param trg  analysis of the target expression
   * @return the entry, or `NodeSeq.Empty` (after printing a message) when either
   *         analysis does not have the form `stem+tags`
   */
  private def toBidixXml(term: Term, src: String, trg: String): NodeSeq = {
    val srcHead = src.split("\\s+").headOption.getOrElse("")
    val trgHead = trg.split("\\s+").headOption.getOrElse("")
    (srcHead, trgHead) match {
      case (reStem(l, lAnal), reStem(r, rAnal)) =>
        val sLeft = reAnal.findAllIn(lAnal).map(posToBidix(_)).toList
        val sRight = reAnal.findAllIn(rAnal).map(posToBidix(_)).toList
        val chpos: NodeSeq =
          if (reAnal.findFirstIn(rAnal) != reAnal.findFirstIn(lAnal)) Comment("CHECK THIS")
          else NodeSeq.Empty
        // NOTE(review): entry markup reconstructed (see object comment) - confirm.
        <e><p><l>{l}{sLeft}</l><r>{r}{sRight}</r></p></e> ++ chpos
      case _ =>
        println("MatchError: [" + src + ", " + trg + "]")
        NodeSeq.Empty
    }
  }

  /**
   * Maps an analysis tag to a dictionary symbol. Only N, A, V, TV and IV produce a symbol;
   * every other tag is dropped.
   *
   * NOTE(review): the symbol markup was missing in the reviewed copy; `<s n="TAG"/>` with
   * the tag name unchanged is assumed here - confirm the intended symbol names.
   */
  private def posToBidix(pos: String): NodeSeq = pos match {
    case "N" | "A" | "V" | "TV" | "IV" => <s n={pos}/>
    case _ => NodeSeq.Empty
  }
}