/**
 * termwiki2xml: command-line script that loads the XML file named by `args(0)`,
 * walks every `page` element in it, parses the template-style text of each page
 * into XML nodes, and writes one set of output files per namespace under
 * `terms/termwiki-<ns>/`.
 *
 * NOTE(review): in this copy of the file the line breaks are collapsed and
 * several expressions typed as `Node`/`Elem` contain only `{...}` blocks or bare
 * words (`createNode`, both `return {nodes}` branches of `entry2xml`, the body of
 * the `wikit` mapping, `termCenter`, `termsLang`, `meta`). That looks like XML
 * literal tags were stripped during extraction -- TODO confirm against the
 * original file before changing any of those definitions. The `//` fragments
 * appear to be commented-out code from the original layout.
 *
 * Data model
 *  - `Entry(exp, lang)`: one expression string and its language string.
 *  - `Term(id, title, namespace, entries)`: one term with its entries.
 *
 * Definitions, in order of appearance
 *  - `termwiki()`: returns `XML.loadFile(args(0))`.
 *  - `wt2xml(elem)`: splits `elem.text` on "\n" and accumulates fields in a
 *    mutable `HashMap`:
 *      - a line starting with `{{` stores the rest of the line under "Name";
 *      - a line starting with `}}{{` passes the map to `entry2xml`, appends the
 *        result to the queue, then stores the text after the first four
 *        characters under "Name";
 *      - a line starting with `|` stores key -> value, where the key is the
 *        text between the first `|` and the first `=` and the value is the text
 *        after the first `=`;
 *      - a line starting with `}}` passes the map to `entry2xml` and appends
 *        the result;
 *      - any other line is ignored.
 *    After each flush `entry.retain((k, v) => v.length < 1)` keeps only the
 *    pairs whose value is empty, i.e. it drops every field that was filled in.
 *    NOTE(review): a `|` line without `=` makes `indexOf("=")` return -1, so
 *    the `substring` call would throw -- confirm the input never has such lines.
 *  - `entry2xml(map)`: removes "Name" from the map via `.get` (throws if the
 *    key is absent), converts every remaining pair with `createNode`, prints
 *    the collected nodes (prefixed with "Concept: " when the name equals
 *    "Concept") and returns them.
 *  - `createNode(arg, v)`: matches on `arg`; "definition", "language" and
 *    "expression" produce a node from `v`, any other key produces a node from
 *    the key itself.
 *  - `save(node, fileName)`: formats `node` with `PrettyPrinter(80, 2)`, writes
 *    it as UTF-8 to `fileName`, closes the writer in `finally`, and returns
 *    `fileName`.
 *
 * Top-level evaluation (runs in the `App` body)
 *  - `xml` is the loaded document and `page` is every `page` descendant.
 *  - `colls`, `langs` and `null_related` are mutable sets filled as side
 *    effects of the mappings below.
 *  - `wikit`: for each page reads `id`, `ns`, `title`, adds `ns.toInt` to
 *    `colls`, takes the last `revision` / `text` child and feeds it to `wt2xml`.
 *  - `terms`: for each "Term" descendant of `wikit` reads the `@id`, `@title`
 *    and `@ns` attributes. The title is the second `:`-separated segment of
 *    `@title`; when there is no second segment the id is printed and the title
 *    stays "". Each "Related" descendant becomes an `Entry` when both its
 *    "expression" and "lang" texts are non-empty, otherwise `null`; its "lang"
 *    text is added to `langs` and its "null" text to `null_related`.
 *  - `termCenter(ns)` and `termsLang(lang, ns)`: iterate `terms`, keep those
 *    whose `namespace == ns`, and skip `null` entries (`termsLang` also
 *    requires `entry.lang == lang`).
 *  - `meta(id)`: builds the metadata element from `id`.
 *  - Finally `null_related` is printed and, for each namespace in `colls`, the
 *    folder `terms/termwiki-<ns>` is created with `mkdir()` if it is not already
 *    a directory, then `meta.xml`, `termcenter.xml` and one `terms-<lang>.xml`
 *    per language in `langs` are saved into it.
 *    NOTE(review): `mkdir()` is used rather than `mkdirs()`, so this presumably
 *    relies on the `terms` directory already existing -- verify.
 */
object termwiki2xml extends App { import java.io.{File,FileOutputStream} import java.nio.channels.Channels import scala.xml._ import scala.collection.mutable.{HashMap,Queue,Set} case class Entry(val exp: String, val lang: String) case class Term(val id: String, val title: String, val namespace: Int, val entries: Seq[Entry]) def termwiki():Elem = { XML.loadFile(args(0)) } def wt2xml(elem:Node):Queue[Node] = { val terms = elem.text.split("\n") val entry = HashMap.empty[String, String] var nodes = Queue[Node]() terms.foreach(ar => { if (ar.startsWith("{{")) { entry += ("Name" -> ar.substring(2)) } else if (ar.startsWith("}}{{")) { nodes += entry2xml(entry) entry.retain((k, v) => v.length < 1) entry += ("Name" -> ar.substring(4)) } else if (ar.startsWith("|")) { entry += (ar.substring(ar.indexOf("|")+1, ar.indexOf("=")) -> ar.substring(ar.indexOf("=")+1)) } else if (ar.startsWith("}}")) { nodes += entry2xml(entry) entry.retain((k, v) => v.length < 1) } // else {println(ar)} }) return nodes } def entry2xml(map:HashMap[String, String]):Node = { var nodes = Queue[Node]() val title = map.remove("Name").get //println(title) map.foreach{ case(k,v) => nodes += createNode(k,v) } // val language = (nodes \ "lang").text // val expression = (nodes \ "expression").text // if (language == "" || expression == "" ) { // println("Empty node") // // } if (title == "Concept") { println("Concept: " + nodes) return {nodes} } else { println(nodes) return {nodes} } } def createNode(arg: String, v: String): Node = arg match { case "definition" => {v} case "language" => {v} case "expression" => {v} case _ => {arg} } def save(node: Node, fileName: String) = { val pp = new PrettyPrinter(80, 2) val fos = new FileOutputStream(fileName) val writer = Channels.newWriter(fos.getChannel(), "UTF-8") try { // writer.write("\n") writer.write(pp.format(node)) } finally { writer.close() } fileName } val xml = termwiki val page = xml \\ "page" var colls = Set[Int]() var langs = Set[String]() var 
null_related = Set[String]() val wikit = (page).map { item => val id = item \ "id" val namespace = (item \ "ns").text val title = (item \ "title").text colls += namespace.toInt val categories = (item \ "revision" \ "text").last {wt2xml(categories)} } val terms = (wikit \\ "Term" ).map { entry => val id = (entry \ "@id").text var title = "" try { title = (entry \ "@title").text.split(':')(1) } catch { case e: ArrayIndexOutOfBoundsException => { println(id) } } val namespace = (entry \ "@ns").text.toInt var entries = (entry \\ "Related").map { item => val exp = (item \\ "expression").text val lang = (item \\ "lang").text null_related += (item \\ "null").text langs += lang if (exp != "" && lang != "") { Entry(exp, lang) } else null } // entries += (entry \\ "Concept").map { item => // val // } // println(entries) Term(id, title, namespace, entries) } def termCenter(ns: Int):Elem = {terms.map { term => if (term.namespace == ns) { { term.entries.map { entry => if (entry != null) { } }} } }} def termsLang(lang: String, ns: Int):Elem = {terms.map { term => if (term.namespace == ns) { term.entries.map { entry => if (entry != null && entry.lang == lang) { {entry.exp} } }} }} def meta(id: String):Elem = termwiki {id} numeric {id} yes // val langs = List(terms foreach { term => // term.entries foreach { entry => // entry.lang // } // }) println(null_related) colls.foreach(coll => { val folder = new File("terms/termwiki-" + coll) if (!folder.isDirectory()) { folder.mkdir() } save(meta("termwiki-" + coll), "terms/termwiki-" + coll + "/meta.xml") save(termCenter(coll), "terms/termwiki-" + coll + "/termcenter.xml") langs.foreach(lang => save(termsLang(lang, coll), "terms/termwiki-" + coll + "/terms-" + lang + ".xml")) }) }