import scala.annotation.tailrec
import scala.io.Source
import scala.xml.pull._
import scala.xml.{XML, NodeSeq}
/**
 * A single expression for a term in one language, together with its metadata.
 *
 * `status`, `sanctioned` and `pos` default to `null`. `definition` and `explanation` are
 * declared as `var`, so they can be reassigned after construction.
 */
case class Entry(
val expression: String,
val language: String,
val status: String = null,
val sanctioned: String = null,
val pos: String = null,
var definition: String,
var explanation: String) {
/**
 * XML form of this entry.
 *
 * @return the value of `xi` when `sanctioned` is exactly the string "True",
 *         otherwise `NodeSeq.Empty`
 */
def toXml: NodeSeq =
if (sanctioned == "True") {
// `aPos` is `pos` prefixed with a backslash, or "" when `pos` is null or empty.
// NOTE(review): `aPos` is never referenced after this line and `XML.loadString` is given an
// empty string; this looks like XML markup was lost from this method. Confirm against the
// original source before relying on it.
val aPos = Option(pos).filter(_.nonEmpty).map("\\" + _).getOrElse("")
val xi = XML.loadString("")
{xi}
} else NodeSeq.Empty
/**
 * XML form of this entry for a rendering that also involves `definition` and `explanation`.
 *
 * @param title not referenced anywhere in the body as it stands
 * @return `NodeSeq.Empty` unless `sanctioned` is exactly the string "True"
 */
def toXmlInLang(title: String): NodeSeq =
if (sanctioned == "True") {
// `aDef` / `aExpl` are the definition / explanation, or "" when the field is null or empty;
// the `map(s => {s})` step returns its argument unchanged.
// NOTE(review): `aPos` is unused, and the three bare `{...}` blocks at the end of this branch
// look like the remains of an XML literal whose tags were stripped. Confirm against the
// original source.
val aPos = Option(pos).filter(_.nonEmpty).map("\\" + _).getOrElse("")
val aDef = Option(definition).filter(_.nonEmpty).map(s => {s}).getOrElse("")
val aExpl = Option(explanation).filter(_.nonEmpty).map(s => {s}).getOrElse("")
{expression}
{aDef}
{aExpl}
} else NodeSeq.Empty
}
/** Factory helpers for [[Entry]]. */
object Entry {

  /**
   * Builds an [[Entry]] from a map of field names to values.
   *
   * "expression" and "language" fall back to the empty string when absent; "status",
   * "sanctioned" and "pos" fall back to `null`. `definition` and `explanation` always
   * start out as `null`.
   *
   * @param m field name to field value
   * @return the entry described by `m`
   */
  def fromMap(m: Map[String, String]): Entry = {
    def orEmpty(key: String): String = m.getOrElse(key, "")
    def orNull(key: String): String = m.get(key).orNull

    Entry(
      expression = orEmpty("expression"),
      language = orEmpty("language"),
      status = orNull("status"),
      sanctioned = orNull("sanctioned"),
      pos = orNull("pos"),
      definition = null,
      explanation = null)
  }
}
/**
 * A term: a title, a category and the set of entries attached to it.
 */
case class Term(
val title: String,
val category: String,
val entries: Set[Entry]) {
// Convenience constructor: empty title, empty category and no entries.
def this() = this("", "", Set())
/**
 * XML form of this term.
 *
 * @return `NodeSeq.Empty` when every entry renders to `NodeSeq.Empty` (or there are no
 *         entries); otherwise the value of the final `{aEntries}` block
 */
def toXml: NodeSeq = {
// `entries` is a Set, so the mapped result is a Set as well: entries that render to equal
// XML collapse into a single element.
val aEntries = entries.map(_.toXml)//.sortBy { case e => (e.language) }
// True when at least one rendered entry is something other than NodeSeq.Empty.
if (aEntries.size != aEntries.count(_ == NodeSeq.Empty)) {
// NOTE(review): a bare `{aEntries}` block is a Set of NodeSeq rather than a NodeSeq; this
// looks like an enclosing XML literal was stripped from the source. Confirm against the
// original file.
{aEntries}
} else NodeSeq.Empty
}
}
/**
 * An [[Iterator]] that additionally declares a `close()` operation.
 *
 * @tparam A the element type
 */
trait CloseableIterator[A] extends Iterator[A] {
/** Declared without an implementation here; each implementor defines what closing does. */
def close(): Unit
}
/** Reads [[Term]]s out of an XML export, one per `page` element. */
object TermwikiParser {

  /**
   * Opens `file` and returns an iterator over the terms it contains.
   *
   * The first term is parsed eagerly when the iterator is created; every call to `next()`
   * parses one term ahead. Iteration ends at the closing `mediawiki` element or when the
   * event stream runs out, whichever comes first. A page that is still open when the
   * stream runs out is discarded.
   *
   * Call `close()` when done: it stops the event reader and closes the underlying file.
   *
   * @param file path of the XML file to read
   * @return an iterator of terms that must be closed by the caller
   */
  def iterator(file: String) = new CloseableIterator[Term] {
    // Kept in a field so close() can release the file handle.
    private val source = Source.fromFile(file)
    val xml = new XMLEventReader(source)
    // Remainder of the event stream that has not been consumed yet. It is re-pointed as
    // pages are consumed so that already-processed events are not kept reachable.
    var xmlStream = xml.toStream
    // Look-ahead: the term that the next call to next() will return.
    var currentTerm: Option[Term] = rawTerm()

    /** Consumes events up to the end of the next page and returns the term built from it. */
    private def rawTerm(): Option[Term] = {
      @tailrec
      def rawTerm0(events: Stream[XMLEvent],
                   inPage: Boolean, inPageTitle: Boolean, inPageText: Boolean,
                   accumulator: Term): Option[Term] =
        if (events.isEmpty) {
          // The stream ended without a closing `mediawiki` element (or nothing followed it):
          // finish cleanly instead of calling head/tail on an empty stream.
          xmlStream = events
          None
        } else {
          val rest = events.tail
          events.head match {
            case EvElemStart(_, "page", _, _) =>
              xmlStream = rest
              rawTerm0(rest, inPage = true, inPageTitle = false, inPageText = false, accumulator)
            case EvElemStart(_, "title", _, _) if inPage =>
              rawTerm0(rest, inPage = true, inPageTitle = true, inPageText = false, accumulator)
            case EvElemStart(_, "text", _, _) if inPage =>
              rawTerm0(rest, inPage = true, inPageTitle = false, inPageText = true, accumulator)
            case EvText(text) if inPageTitle =>
              // The category is everything before the first ':'. takeWhile gives the same
              // result as split(':')(0) but does not throw for a title made only of colons.
              rawTerm0(rest, inPage = true, inPageTitle = true, inPageText = false,
                accumulator.copy(title = text, category = text.takeWhile(_ != ':')))
            case EvText(text) if inPageText =>
              // NOTE(review): if the reader delivers the content of one `text` element as
              // several EvText events (for example around entity references), each fragment
              // is handed to TermParser.parseTerm on its own. Confirm whether the fragments
              // should be concatenated first.
              rawTerm0(rest, inPage = true, inPageTitle = false, inPageText = true,
                TermParser.parseTerm(accumulator, text))
            case EvElemEnd(_, "page") =>
              xmlStream = rest
              Option(accumulator)
            case EvElemEnd(_, "title") =>
              rawTerm0(rest, inPage = true, inPageTitle = false, inPageText = false, accumulator)
            case EvElemEnd(_, "text") =>
              rawTerm0(rest, inPage = true, inPageTitle = false, inPageText = false, accumulator)
            case EvElemEnd(_, "mediawiki") =>
              // End of the document: nothing after this point is ever read, so drop the
              // remaining stream instead of stepping past an element that may not exist.
              xmlStream = Stream.empty
              None
            case _ =>
              rawTerm0(rest, inPage, inPageTitle, inPageText, accumulator)
          }
        }

      rawTerm0(xmlStream, inPage = false, inPageTitle = false, inPageText = false, new Term())
    }

    override def hasNext: Boolean = currentTerm.isDefined

    override def next(): Term = {
      val term = currentTerm.getOrElse(throw new NoSuchElementException("next on empty iterator"))
      currentTerm = rawTerm()
      term
    }

    override def close(): Unit = {
      // Stop the reader first, then release the file even if stopping throws.
      try xml.stop()
      finally source.close()
    }
  }
}
/** Extracts entries and per-language concept fields from the wiki markup of a term page. */
object TermParser {
  // Patterns are compiled once here rather than on every call.

  // A `{{...}}` block whose body contains no closing brace.
  private val BlockPattern = """(?s)\{\{([^\}]*?)\}\}""".r
  // A block starting with `{{Concept`; captures the rest of the body.
  private val ConceptPattern = """(?s)^\{\{Concept(.*)\}\}""".r
  // A block starting with `{{Related expression`; captures the rest of the body.
  private val RelatedExpressionPattern = """(?s)^\{\{Related expression(.*)\}\}""".r
  // One `|name=value` field; the value runs to the end of the line.
  private val FieldPattern = """\|([^=]+)=(.*)""".r
  // Splits a field name such as `definition_en` at its last underscore.
  private val LanguageSuffixPattern = """(.*)_(.*)""".r

  /**
   * Parses the template blocks in `text` and returns `accTerm` with its entries replaced.
   *
   * Every "Related expression" block becomes one [[Entry]]. The fields of the "Concept"
   * block (the last one, if there are several) supply per-language definitions and
   * explanations. Entries already present on `accTerm` are replaced, not merged.
   *
   * @param accTerm the term whose `entries` are to be replaced
   * @param text    wiki markup to scan for `{{...}}` blocks
   * @return a copy of `accTerm` carrying the entries found in `text`
   */
  def parseTerm(accTerm: Term, text: String): Term = {
    val nothingYet = (Map.empty[String, String], Set.empty[Entry])
    val (conceptFields, entries) =
      BlockPattern.findAllIn(text).foldLeft(nothingYet) { case ((fields, found), block) =>
        block match {
          case ConceptPattern(body) => (fieldMap(body), found)
          case RelatedExpressionPattern(body) => (fields, found + Entry.fromMap(fieldMap(body)))
          case _ => (fields, found)
        }
      }
    accTerm.copy(entries = parseFieldMap(entries, conceptFields))
  }

  /** Collects every `|name=value` field found in `s` into a map (later duplicates win). */
  private def fieldMap(s: String): Map[String, String] =
    FieldPattern.findAllMatchIn(s).map(m => m.group(1) -> m.group(2)).toMap

  /**
   * Applies `definition_<lang>` and `explanation_<lang>` fields to the entries.
   *
   * For each such field, the first entry found whose `language` equals `<lang>` receives
   * the value; other fields are ignored. The entries passed in are not mutated: changing a
   * field of a case-class instance that is already stored in a Set changes its hash code in
   * place and can break set membership, so updated copies are built and a fresh Set is
   * returned instead.
   *
   * @param entries entries to update
   * @param fields  field name to value, as produced by `fieldMap`
   * @return the entries with definitions and explanations filled in
   */
  private def parseFieldMap(entries: Set[Entry], fields: Map[String, String]): Set[Entry] = {
    // Keyed by the untouched original entry so that a definition and an explanation for the
    // same language both end up on the same updated copy.
    val replacements = fields.foldLeft(Map.empty[Entry, Entry]) { case (acc, (name, value)) =>
      name match {
        case LanguageSuffixPattern(kind, lang) =>
          entries.find(_.language == lang).fold(acc) { original =>
            val current = acc.getOrElse(original, original)
            kind match {
              case "definition" => acc.updated(original, current.copy(definition = value))
              case "explanation" => acc.updated(original, current.copy(explanation = value))
              case _ => acc
            }
          }
        case _ => acc
      }
    }
    entries.map(entry => replacements.getOrElse(entry, entry))
  }
}