import scala.io.Codec
import scala.xml.{Comment, Elem, NodeSeq, PrettyPrinter, XML}
import collection.mutable.{HashMap, MultiMap, Set}
import java.io.FileOutputStream
import java.nio.channels.Channels
import TermwikiParser._
object Extract extends App {
// Implicit UTF-8 codec, in scope for the rest of this object.
implicit val codec: Codec = Codec.UTF8

// Inputs: the TermWiki dump handed to TermwikiParser.iterator, and the existing
// sme-nob bilingual dictionary, which is loaded whole.
val file = "../../words/terms/termwiki/dump.xml"
val bidixXml = XML.loadFile("apertium-sme-nob.sme-nob.dix")
val iterator = TermwikiParser.iterator(file)

// Analyzer instances keyed by the language code used in the pair lists below. Each one
// is built from a three-letter language name and the analyser file under that
// language's directory; construction order is se, nb, fi, sma, smj, smn.
val analysers: HashMap[String, Analyzer] = HashMap()
for ((code, lang) <- List("se" -> "sme", "nb" -> "nob", "fi" -> "fin",
                          "sma" -> "sma", "smj" -> "smj", "smn" -> "smn"))
  analysers += (code -> new Analyzer(lang, "../../langs/" + lang + "/tools/mt/apertium/analyser-mt-gt-desc.hfstol"))

// (source, target) language pairs that get bidix entries.
val bidix = List(
  "se" -> "nb", "se" -> "sma", "se" -> "smj", "se" -> "smn", "se" -> "fi")

// (source, target) language pairs that get plain term lines.
val pairs = List(
  "se" -> "smj", "se" -> "smn", "se" -> "nb", "se" -> "sma",
  "nb" -> "se", "nb" -> "sma", "nb" -> "smj",
  "fi" -> "smn", "fi" -> "sms", "fi" -> "se",
  "se" -> "fi")

// Accumulators keyed by "<source>-<target>": term lines and bidix XML fragments.
val files = new HashMap[String, Set[String]] with MultiMap[String, String]
val bidixFiles = new HashMap[String, Set[NodeSeq]] with MultiMap[String, NodeSeq]

// Text of every <l> element of the loaded bidix, held in a mutable set.
// (Never reassigned in the code visible here; left as a var to keep the member unchanged.)
var bidixTerms: Set[String] = (bidixXml \\ "l").map(_.text).toSet.to[collection.mutable.Set]
// Walk the parsed terms once, filling bidixFiles (XML entries for the `bidix` pairs)
// and files (plain term lines for `pairs`). The iterator is closed even when
// processing throws.
try {
  // An analysis passes when it contains +Sg or +Pl followed later by +Nom or +Indef,
  // or +V+TV / +V+IV followed directly by +Inf. The alternatives are written as groups:
  // a bracketed "[Sg|Pl]" is a character class that matches any single one of the
  // characters S, g, |, P, l, which lets unrelated tags through.
  // Compiled once here instead of once per analysis via String.matches.
  val reFilter = ".*\\+(Sg|Pl).*\\+(Nom|Indef).*|.*\\+V(\\+TV|\\+IV)\\+Inf.*".r.pattern

  // Analyses of `lemma` from the analyser registered for `lang`, keeping only those
  // whose text before the first '+' equals `lemma` and that pass reFilter; sorted and
  // de-duplicated. takeWhile is used rather than split("\\+")(0), which throws on an
  // analysis made up solely of '+' characters.
  def citationForms(lang: String, lemma: String): List[String] =
    analysers(lang).analyse(lemma)
      .filter(a => a.takeWhile(_ != '+') == lemma && reFilter.matcher(a).matches())
      .sorted
      .distinct

  iterator.foreach { term =>
    // Part 1: bidix entries.
    bidix.foreach { case (srclng, trglng) =>
      // If a term has several entries in one language, the last one wins.
      var src = ""
      var trg = ""
      term.entries foreach { e =>
        e.language match {
          case `srclng` => src = e.expression
          case `trglng` => trg = e.expression
          case _ =>
        }
      }
      // Only "se" sources whose expression is not already an <l> in the loaded bidix.
      if (srclng == "se" && !bidixTerms.contains(src)) {
        val srcAnalysis = if (src != "") citationForms(srclng, src) else Nil
        val trgAnalysis = if (trg != "") citationForms(trglng, trg) else Nil
        // One entry per (source analysis, target analysis) combination.
        for (s <- srcAnalysis; t <- trgAnalysis if s.nonEmpty && t.nonEmpty)
          bidixFiles.addBinding(srclng + "-" + trglng, toBidixXml(term, s, t))
      }
    }

    // Part 2: plain term lines "src<TAB>trg<TAB>category explanation definition".
    pairs.foreach { case (srclng, trglng) =>
      var src = ""
      var trg = ""
      var defn = ""
      var expl = ""
      term.entries foreach { e =>
        e.language match {
          case `srclng` =>
            src = e.expression
            // definition / explanation may be null; treat that as empty text.
            defn = Option(e.definition).getOrElse("")
            expl = Option(e.explanation).getOrElse("")
          case `trglng` => trg = e.expression
          case _ =>
        }
      }
      if (src != "" && trg != "") {
        val line = src + "\t" + trg + "\t" + term.category + " " + expl + " " + defn
        files.addBinding(srclng + "-" + trglng, line)
      }
    }
  }
} finally {
  iterator.close()
}
// Write one "bidix/<pair>.dix" file per key of bidixFiles: first the renamed <s>
// elements found in that pair's entries, then the entries sorted by their <l> text.
bidixFiles foreach { case (pairName, entries) =>
  val stream = new FileOutputStream("bidix/" + pairName + ".dix")
  val out = Channels.newWriter(stream.getChannel(), "UTF-8")
  // Every <s> descendant of this pair's entries, collected into a set.
  val sdefs: Set[NodeSeq] = entries flatMap { entry => entry \\ "s" }
  try {
    // NOTE(review): the bare "\n" writes below sit exactly where a dictionary file
    // would carry its XML declaration and opening/closing tags; presumably markup was
    // lost from these literals -- confirm against the original before relying on the output.
    out.write("\n")
    out.write("\n")
    out.write("\n")
    for (s <- sdefs) {
      out.write(rename("sdef", s).toString + "\n")
    }
    out.write("\n")
    out.write("\n")
    val sortedEntries = entries.toSeq.sortBy(x => (x \\ "l").text)
    for (entry <- sortedEntries) {
      out.write(entry.toString + "\n")
    }
    out.write("\n")
    out.write("\n")
  } finally {
    out.close()
  }
}
// Write one "omegat/<pair>.txt" file per key of files, one collected line per row.
files foreach { case (pairName, lines) =>
  val stream = new FileOutputStream("omegat/" + pairName + ".txt")
  val out = Channels.newWriter(stream.getChannel(), "UTF-8")
  try {
    for (text <- lines) {
      out.write(text)
      out.write("\n")
    }
  } finally {
    out.close()
  }
}
/** Returns `n` rebuilt under the element name `label`.
  *
  * When `n` matches the `Elem` extractor, its prefix, attributes, scope and children are
  * carried over into a new `Elem` (created with `true` for the minimise-empty flag).
  * Anything else is handed back unchanged.
  */
private def rename(label: String, n: NodeSeq): NodeSeq =
  n match {
    case Elem(pre, _, attrs, ns, kids @ _*) =>
      Elem(pre, label, attrs, ns, true, kids: _*)
    case other =>
      other
  }
/** Builds one bidix entry from a source-side and a target-side analysis string.
  *
  * Only the first whitespace-delimited token of each string is used. It is split into the
  * part before the first '+' (the lemma) and the '+'-separated tags after it; every tag is
  * mapped through `posToBidix`. When the first tag differs between the two sides, a
  * "CHECK THIS" XML comment is appended after the entry.
  *
  * NOTE(review): the element markup of the result was missing from this file. The
  * `<l>` and `<s>` names follow from how the rest of the file queries the entries
  * (`\\ "l"`, `\\ "s"`); the surrounding `<e><p>…<r>…</r></p></e>` shape is assumed from
  * the Apertium bilingual-dictionary format -- confirm against the original.
  *
  * @param term the term the analyses belong to (not used by the current implementation)
  * @param src  source analysis, e.g. of the form `lemma+Tag+Tag`
  * @param trg  target analysis, same form
  * @return the entry (plus optional comment), or `NodeSeq.Empty` after printing a
  *         "MatchError: [src, trg]" line when either side has no `lemma+tags` shape
  */
private def toBidixXml(term: Term, src: String, trg: String): NodeSeq = {
  val reStem = """([^\+]+)(\+.*)""".r
  val reAnal = """([^\+]+)""".r

  // headOption instead of (0): split yields an empty array for whitespace-only input.
  def firstToken(s: String): String = s.split("\\s+").headOption.getOrElse("")

  // Pattern match instead of catching MatchError; same message and result on mismatch.
  (firstToken(src), firstToken(trg)) match {
    case (reStem(l, lAnal), reStem(r, rAnal)) =>
      val sLeft = reAnal.findAllIn(lAnal).map(posToBidix(_))
      val sRight = reAnal.findAllIn(rAnal).map(posToBidix(_))
      // Flag entries whose first tag differs between the two sides.
      val chpos =
        if (reAnal.findFirstIn(rAnal) != reAnal.findFirstIn(lAnal)) Comment("CHECK THIS")
        else NodeSeq.Empty
      <e><p><l>{l}{sLeft}</l><r>{r}{sRight}</r></p></e> ++ chpos
    case _ =>
      println("MatchError: [" + src + ", " + trg + "]")
      NodeSeq.Empty
  }
}
/** Maps one analyser tag to the bidix symbol element that represents it.
  *
  * Tags without a mapping yield `NodeSeq.Empty`, i.e. they are left out of the entry.
  *
  * NOTE(review): the five case bodies were empty in this file, which cannot type-check
  * against the declared `NodeSeq` result. The `<s n="…"/>` elements below are restored
  * from the usual Apertium symbol names (the writer in this file collects `<s>` elements
  * and renames them to `sdef`) -- confirm the `n` values against the sdefs of the
  * target dictionary.
  *
  * @param pos a single tag such as "N", "V" or "TV" (without the leading '+')
  * @return the matching `<s n="…"/>` element, or `NodeSeq.Empty` for any other tag
  */
private def posToBidix(pos: String): NodeSeq = pos match {
  case "N" => <s n="n"/>
  case "A" => <s n="adj"/>
  case "V" => <s n="vblex"/>
  case "TV" => <s n="tv"/>
  case "IV" => <s n="iv"/>
  case _ => NodeSeq.Empty
}
}