import scala.annotation.tailrec
import scala.io.Source
import scala.xml.pull._
import scala.xml.{XML, NodeSeq}

/**
 * One expression attached to a [[Term]].
 *
 * `status`, `sanctioned` and `pos` default to `null`. `definition` and
 * `explanation` are declared as `var`s, so they can be reassigned by callers;
 * note that they take part in the generated `equals`/`hashCode`, so an instance
 * must not be mutated while it is stored in a hashed collection.
 *
 * @param expression  the expression text (rendered by [[toXmlInLang]])
 * @param language    language code; compared against the `_<lang>` suffix of
 *                    concept fields in `TermParser`
 * @param status      read from the `status` field by [[Entry.fromMap]]
 * @param sanctioned  the XML renderers only emit output when this is exactly `"True"`
 * @param pos         optional tag; when non-empty it is prefixed with a backslash
 *                    by the XML renderers (presumably a part-of-speech code — confirm)
 * @param definition  optional definition text
 * @param explanation optional explanation text
 */
case class Entry(
    expression: String,
    language: String,
    status: String = null,
    sanctioned: String = null,
    pos: String = null,
    var definition: String,
    var explanation: String) {

  /**
   * XML rendering of this entry.
   *
   * @return `NodeSeq.Empty` unless `sanctioned == "True"`
   */
  def toXml: NodeSeq = if (sanctioned == "True") {
    val aPos = Option(pos).filter(_.nonEmpty).map("\\" + _).getOrElse("")
    val xi = XML.loadString("")

    // NOTE(review): as it stands this branch just yields `xi`; `aPos` is computed
    // but never used and `XML.loadString` receives an empty string. The element
    // markup that presumably wrapped these expressions appears to be missing from
    // this copy of the file - restore it from the upstream source before use.
    {xi}
  } else NodeSeq.Empty

  /**
   * XML rendering of this entry including its definition and explanation.
   *
   * @param title not referenced by the code visible here
   * @return `NodeSeq.Empty` unless `sanctioned == "True"`
   */
  def toXmlInLang(title: String): NodeSeq = if (sanctioned == "True") {
    val aPos = Option(pos).filter(_.nonEmpty).map("\\" + _).getOrElse("")
    val aDef = Option(definition).filter(_.nonEmpty).map(s => {s}).getOrElse("")
    val aExpl = Option(explanation).filter(_.nonEmpty).map(s => {s}).getOrElse("")

    // NOTE(review): the three splices below have no enclosing element, and `aPos`
    // and `title` are unused. The XML literal that presumably surrounded them
    // appears to be missing from this copy - restore it from the upstream source.
    {expression} {aDef} {aExpl}
  } else NodeSeq.Empty
}

object Entry {

  /**
   * Builds an [[Entry]] from a template field map.
   *
   * `expression` and `language` fall back to the empty string; `status`,
   * `sanctioned` and `pos` fall back to `null`. `definition` and `explanation`
   * always start out as `null`.
   *
   * @param m field name to field value
   */
  def fromMap(m: Map[String, String]): Entry =
    Entry(
      expression = m.getOrElse("expression", ""),
      language = m.getOrElse("language", ""),
      status = m.get("status").orNull,
      sanctioned = m.get("sanctioned").orNull,
      pos = m.get("pos").orNull,
      definition = null,
      explanation = null)
}

/**
 * A page of the wiki dump together with the entries parsed from its text.
 *
 * @param title    page title
 * @param category the part of the title before the first `:`
 * @param entries  expressions found in the page text
 */
case class Term(
    title: String,
    category: String,
    entries: Set[Entry]) {

  /** Creates an empty term: blank title and category, no entries. */
  def this() = this("", "", Set())

  /**
   * XML rendering of the term.
   *
   * @return `NodeSeq.Empty` when every entry renders to `NodeSeq.Empty`
   */
  def toXml: NodeSeq = {
    val aEntries = entries.map(_.toXml) //.sortBy { case e => (e.language) }
    // True when at least one entry produced non-empty output.
    if (aEntries.size != aEntries.count(_ == NodeSeq.Empty)) {
      // NOTE(review): the enclosing element markup appears to be missing from
      // this copy of the file - restore it from the upstream source.
      {aEntries}
    } else NodeSeq.Empty
  }
}

/** An iterator that owns a resource which the caller releases with `close()`. */
trait CloseableIterator[A] extends Iterator[A] {
  def close(): Unit
}

object TermwikiParser {

  /**
   * Text for an entity reference event. The five predefined XML entities are
   * decoded; any other name is written back in `&name;` form.
   */
  private def entityText(name: String): String = name match {
    case "lt"   => "<"
    case "gt"   => ">"
    case "amp"  => "&"
    case "quot" => "\""
    case "apos" => "'"
    case other  => "&" + other + ";"
  }

  /**
   * Streams the `page` elements of an XML file as [[Term]]s.
   *
   * The file is opened with `Source.fromFile` and read through an
   * `XMLEventReader`. The first term is parsed eagerly when the iterator is
   * created; each `next()` parses one page ahead.
   *
   * @param file path of the XML file
   * @return an iterator over the terms; call `close()` when done
   */
  def iterator(file: String) = new CloseableIterator[Term] {
    val xml = new XMLEventReader(Source.fromFile(file))
    var xmlStream = xml.toStream
    var currentTerm: Option[Term] = rawTerm()

    /**
     * Consumes events from `xmlStream` up to and including the next
     * `</page>` and returns the term collected on the way.
     *
     * @return `None` once `</mediawiki>` is reached or the events run out
     */
    private def rawTerm(): Option[Term] = {
      // The parser reports the content of one element as several events when it
      // contains entity or character references, so the pieces are collected
      // here and only interpreted when the closing tag arrives.
      val titleBuf = new StringBuilder
      val textBuf = new StringBuilder

      @tailrec
      def scan(events: Stream[XMLEvent],
               inPage: Boolean,
               inPageTitle: Boolean,
               inPageText: Boolean,
               accumulator: Term): Option[Term] =
        if (events.isEmpty) {
          // Ran out of events without seeing </page> or </mediawiki>.
          xmlStream = events
          None
        } else {
          val rest = events.tail
          events.head match {
            case EvElemStart(_, "page", _, _) =>
              // Re-anchor the field so the cells already consumed can be collected.
              xmlStream = rest
              scan(rest, true, false, false, accumulator)

            case EvElemStart(_, "title", _, _) if inPage =>
              titleBuf.clear()
              scan(rest, true, true, false, accumulator)

            case EvElemStart(_, "text", _, _) if inPage =>
              textBuf.clear()
              scan(rest, true, false, true, accumulator)

            case EvText(chunk) if inPageTitle =>
              titleBuf.append(chunk)
              scan(rest, true, true, false, accumulator)

            case EvEntityRef(name) if inPageTitle =>
              titleBuf.append(entityText(name))
              scan(rest, true, true, false, accumulator)

            case EvText(chunk) if inPageText =>
              textBuf.append(chunk)
              scan(rest, true, false, true, accumulator)

            case EvEntityRef(name) if inPageText =>
              textBuf.append(entityText(name))
              scan(rest, true, false, true, accumulator)

            case EvElemEnd(_, "page") =>
              xmlStream = rest
              Option(accumulator)

            case EvElemEnd(_, "title") =>
              val updated =
                if (inPageTitle && titleBuf.nonEmpty) {
                  val fullTitle = titleBuf.toString
                  // takeWhile rather than split(':')(0): same result, but total
                  // for titles that consist only of colons.
                  accumulator.copy(title = fullTitle, category = fullTitle.takeWhile(_ != ':'))
                } else accumulator
              scan(rest, true, false, false, updated)

            case EvElemEnd(_, "text") =>
              val updated =
                if (inPageText && textBuf.nonEmpty) TermParser.parseTerm(accumulator, textBuf.toString)
                else accumulator
              scan(rest, true, false, false, updated)

            case EvElemEnd(_, "mediawiki") =>
              // drop(1) instead of tail: safe when this is the very last event.
              xmlStream = rest.drop(1)
              None

            case _ =>
              scan(rest, inPage, inPageTitle, inPageText, accumulator)
          }
        }

      scan(xmlStream, false, false, false, new Term())
    }

    override def hasNext: Boolean = currentTerm.isDefined

    /** @throws NoSuchElementException when no term is left */
    override def next(): Term = {
      val term = currentTerm.getOrElse(throw new NoSuchElementException("next on empty iterator"))
      currentTerm = rawTerm()
      term
    }

    override def close(): Unit = {
      // NOTE(review): this stops the event reader only; the Source opened for
      // `file` is not closed here - confirm whether the handle should be released.
      xml.stop()
    }
  }
}

object TermParser {

  // Compiled once; every pattern string is unchanged.
  private val reBlock = """(?s)\{\{([^\}]*?)\}\}""".r
  private val reConcept = """(?s)^\{\{Concept(.*)\}\}""".r
  private val reExp = """(?s)^\{\{Related expression(.*)\}\}""".r
  private val rePattern = """\|([^=]+)=(.*)""".r
  private val reExtractLang = """(.*)_(.*)""".r

  /**
   * Parses the `{{...}}` template blocks of a page text.
   *
   * Every `{{Related expression ...}}` block becomes an [[Entry]]. The fields
   * of the last `{{Concept ...}}` block are then applied to the entries (see
   * `parseFieldMap`). Other blocks are ignored.
   *
   * @param accTerm term whose `entries` are replaced
   * @param text    page text
   * @return a copy of `accTerm` carrying the entries found in `text`
   */
  def parseTerm(accTerm: Term, text: String): Term = {
    val blocks = reBlock.findAllIn(text).toList
    val fields = blocks
      .collect { case reConcept(body) => fieldMap(body) }
      .lastOption
      .getOrElse(Map.empty[String, String])
    val entries = blocks
      .collect { case reExp(body) => Entry.fromMap(fieldMap(body)) }
      .toSet
    accTerm.copy(entries = parseFieldMap(entries, fields))
  }

  /** Collects the `|name=value` pairs of a template body; a value ends at the line break. */
  private def fieldMap(s: String): Map[String, String] =
    rePattern.findAllMatchIn(s).map(m => m.group(1) -> m.group(2)).toMap

  /**
   * Applies `definition_<lang>` / `explanation_<lang>` fields to the entries.
   *
   * For each such field the first entry of `entries` (in iteration order)
   * whose language equals `<lang>` receives the value. The result is built
   * from copies: the entries are case-class values stored in a `Set`, and
   * changing a field in place would leave them filed under a stale hash.
   *
   * @param entries entries parsed from the page
   * @param fields  fields of the concept block
   * @return `entries` with the matching definitions and explanations filled in
   */
  private def parseFieldMap(entries: Set[Entry], fields: Map[String, String]): Set[Entry] = {
    // original entry -> its updated copy
    val updated = fields.foldLeft(Map.empty[Entry, Entry]) {
      case (acc, (key, value)) =>
        key match {
          case reExtractLang(field, lang) =>
            entries.find(_.language == lang).fold(acc) { target =>
              val current = acc.getOrElse(target, target)
              field match {
                case "definition"  => acc.updated(target, current.copy(definition = value))
                case "explanation" => acc.updated(target, current.copy(explanation = value))
                case _             => acc
              }
            }
          case _ => acc
        }
    }
    entries.map(e => updated.getOrElse(e, e))
  }
}