/* * This file is part of makedict - convertor from any * dictionary format to any http://sdcv.sourceforge.net * * Copyright (C) Evgeniy Dushistov, 2005 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* * module for work with dsl format(http://lingvo.ru) */ #ifdef HAVE_CONFIG_H # include "config.h" #endif #include #include #include #include #include #include using namespace std; #include #include "charset_conv.hpp" #include "mapfile.hpp" #include "normalize_tags.hpp" #include "utils.hpp" #include "parser.hpp" //#define DEBUG class dsl_parser : public makedict_parser { public: dsl_parser(); ~dsl_parser() {} protected: bool is_my_format(const std::string& url) { return g_str_has_suffix(url.c_str(), ".dsl"); } int parse(const string& filename); private: typedef enum { NAME=0, INDEX_LANGUAGE=1, CONTENTS_LANGUAGE=2, SOURCE_CODE_PAGE=3, UNKNOWN=4 } enHeaderTags; static TagInfoList taginfo_list; string basename; bool utf16;//mean that source file in utf16 bool little_endian;//mean that order of bytes in little_endian guint32 linenum;//number of current line string line; string name, index_language, contents_language; string from_codeset; static Str2StrTable code_page_table; static Str2StrTable short_lang_table; static Str2StrTable replace_table; typedef map > Char2Str; static Char2Str text2xml; char *end_of_file; bool not_close_comment; bool getline(MapFile& in); enHeaderTags is_line_has_tag(void); bool get_tag_value(const char* tag_name, string& value); static bool long_to_short(string& longlang, string& shortlang); //this is a real function which parse file, //because of in additition to dsl by itself, //may be need parse file with abbrevations, //which have the same format int parse(MapFile& in, bool only_info, bool abr); int print_info(); bool parse_header(MapFile& in, CharsetConv& conv); bool read_keys(MapFile& in, const CharsetConv& conv, vector& key_list); void utf16_to_machine(gint16 &ch); }; Str2StrTable dsl_parser::code_page_table; Str2StrTable dsl_parser::replace_table; Str2StrTable dsl_parser::short_lang_table; dsl_parser::Char2Str dsl_parser::text2xml; TagInfoList dsl_parser::taginfo_list; dsl_parser::dsl_parser() { not_close_comment=false; set_parser_info("format", "dsl"); set_parser_info("version", "dsl_parser, version 0.1"); parser_options["full_name"]=""; parser_options["encoding"]=""; parser_options["lang_from"]=""; parser_options["lang_to"]=""; if (!code_page_table.empty()) return; code_page_table["Latin"]="CP1252"; code_page_table["Cyrillic"]="CP1251"; code_page_table["EasternEuropean"]="CP1250"; short_lang_table["afrikaans"]="AFR"; short_lang_table["basque"]="BAQ"; short_lang_table["belarusian"]="BEL"; short_lang_table["bulgarian"]="BUL"; short_lang_table["czech"]="CZE"; short_lang_table["danish"]="DAN"; short_lang_table["dutch"]="DUT"; short_lang_table["english"]="ENG"; short_lang_table["finnish"]="FIN"; short_lang_table["french"]="FRA"; short_lang_table["german"]="GER"; short_lang_table["germannewspelling"]="GER"; short_lang_table["hungarian"]="HUN"; short_lang_table["indonesian"]="IND"; short_lang_table["italian"]="ITA"; short_lang_table["norwegianbokmal"]="NOB"; short_lang_table["norwegiannynorsk"]="NNO"; short_lang_table["polish"]="POL"; short_lang_table["portuguesestandard"]="POR"; short_lang_table["russian"]="RUS"; short_lang_table["serbiancyrillic"]="SCC"; short_lang_table["spanishmodernsort"]="SPA"; short_lang_table["spanishtraditionalsort"]="SPA"; short_lang_table["swahili"]="SWA"; short_lang_table["swedish"]="SWE"; short_lang_table["ukrainian"]="UKR"; replace_table["[/m]"]=""; #if 0 replace_table["[ref]"]=""; replace_table["[/ref]"]=""; #endif replace_table["[url]"]=""; replace_table["[/url]"]=""; replace_table["[!trs]"]=""; replace_table["[/!trs]"]=""; replace_table["[/lang]"]=""; replace_table["[*]"]=""; replace_table["[/*]"]=""; replace_table["{{"]=""; replace_table["<<"]=""; replace_table[">>"]=""; replace_table["[s]"]=""; replace_table["[/s]"]=""; replace_table["[m]"]="";//handle errors in dsl replace_table["[m1]"]=" "; replace_table["[m2]"]=" "; replace_table["[m3]"]=" "; replace_table["[m4]"]=" "; replace_table["[m5]"]=" "; replace_table["[m6]"]=" "; replace_table["[m7]"]=" "; replace_table["[m8]"]=" "; replace_table["[m9]"]=" "; text2xml['<']="<"; text2xml['>']=">"; text2xml['&']="&"; text2xml['\"']="""; taginfo_list.push_back(TagInfo("[b]", "[/b]", "", "", TagInfo::tB)); taginfo_list.push_back(TagInfo("[i]", "[/i]", "", "", TagInfo::tI)); taginfo_list.push_back(TagInfo("[t]", "[/t]", "", "", TagInfo::tTranscription)); taginfo_list.push_back(TagInfo("[com]", "[/com]", "", "", TagInfo::tComment)); taginfo_list.push_back(TagInfo("[c", "[/c]", "", "", TagInfo::tColor, true)); taginfo_list.push_back(TagInfo("[p]", "[/p]", "", "", TagInfo::tAbr)); taginfo_list.push_back(TagInfo("[sub]", "[/sub]", "_{", "}", TagInfo::tSub)); taginfo_list.push_back(TagInfo("[sup]", "[/sup]", "^{", "}", TagInfo::tSup)); taginfo_list.push_back(TagInfo("[trn]", "[/trn]", "", "", TagInfo::tDtrn)); taginfo_list.push_back(TagInfo("[ex]", "[/ex]", "", "", TagInfo::tExample)); taginfo_list.push_back(TagInfo("[u]", "[/u]", "", "", TagInfo::tColor)); taginfo_list.push_back(TagInfo("[ref", "[/ref]", "", "", TagInfo::tKref, true)); } //convert from Long language name, like English //to short, like eng inline bool dsl_parser::long_to_short(string& longlang, string& shortlang) { tolower(longlang); Str2StrTable::iterator lang=short_lang_table.find(longlang.c_str()); if (lang==short_lang_table.end()) { cerr<<_("Unknwon language: ")<first<<'\t'; cerr<second; return true; } int dsl_parser::parse(const string& filename) { int res=EXIT_FAILURE; basename.assign(filename); string::size_type pos=basename.rfind('.'); if (pos!=string::size_type(-1)) basename.erase(pos, basename.length()-pos); //search icon string icon_name=basename+".bmp"; if (is_file_exist(icon_name)) set_dict_info("icon", icon_name); string dirname(filename); pos=dirname.rfind(G_DIR_SEPARATOR); if (pos!=string::size_type(-1)) dirname.erase(pos, dirname.length()-pos); else dirname="."; { MapFile in; if (!in.open(filename.c_str())) { cerr<<_("Can not open file: ")< annot_buf(annotation.tellg()); annotation.seekg(0); annotation.read(&annot_buf[0], annot_buf.size()); string anot_str(&annot_buf[0], annot_buf.size()); vector from_codesets; if (!from_codeset.empty()) from_codesets.push_back(from_codeset); from_codesets.push_back("UTF-16"); from_codesets.push_back("UCS-2"); string convstr; vector::iterator it; for (it=from_codesets.begin(); it!=from_codesets.end(); ++it) { CharsetConv conv(it->c_str(), "UTF-8"); convstr.clear(); if (!conv.convert(anot_str, convstr)) convstr=anot_str; if (g_utf8_validate(convstr.c_str(), convstr.length(), NULL)) break; } if (it==from_codesets.end()) { gchar *mes=g_strdup_printf(_("Annotation of dictionary is not in %s\n" "Recode it to %s, and try again\n"), from_codesets[0].c_str(), from_codesets[0].c_str()); cerr<"]=">"; str2xml["&"]="&"; str2xml["\""]="""; replace(str2xml, new_convstr.c_str(), convstr); set_dict_info("description", convstr); } begin(); return EXIT_SUCCESS; } inline void dsl_parser::utf16_to_machine(gint16 &ch) { if (little_endian) ch=GINT16_FROM_LE(ch); else ch=GINT16_FROM_BE(ch); } //read one line from file //and convert it to utf8, if file in utf16 bool dsl_parser::getline(MapFile& in) { reread_line: ++linenum; line.clear(); if (!utf16) { char ch; while (in.cur(in.cur); in.cur+=sizeof(ch); utf16_to_machine(ch); if (ch=='\r' || ch=='\n') break; line+=*reinterpret_cast(&ch); line+=*(reinterpret_cast(&ch)+1); } if (in.cur+1(in.cur); utf16_to_machine(ch); if (ch=='\r' || ch=='\n') in.cur+=sizeof(ch); } string convstr; if (conv.convert(line, convstr)) line=convstr; } if (not_close_comment) { string::size_type com_end=line.find("}}"); if (com_end==string::npos) { if (!(in.cur=line.length()) { cerr<first<second; conv.workwith(from_codeset.c_str(), "UTF-8"); utf16=false; } break; default: /*this not should happen*/; }//switch (tag) { } return true; } bool dsl_parser::read_keys(MapFile& in, const CharsetConv& conv, vector& key_list) { Char2Str::iterator c2si; string *cur_line; do { string utf8str; if (!from_codeset.empty()) { if (!conv.convert(line, utf8str)) { gchar *mes=g_strdup_printf(_("Can not convert from %s to UTF-8"), from_codeset.c_str()); cerr<c_str(), gssize(-1), NULL)) { cerr<<_("Not valid UTF-8 string: ")<c_str(); while (*p) { if (*p=='\\') {//skip first '\' if (!(*++p)) break; goto add_char; } else if (*p=='(') { key_enc+=""; } else if (*p==')') { key_enc+=""; } else { add_char: if ((c2si=text2xml.find(*p))==text2xml.end()) key_enc+=*p; else key_enc+=c2si->second; } ++p; } //remove last blank characters string::reverse_iterator ri; for (ri=key_enc.rbegin(); ri!=key_enc.rend(); ++ri) if (*ri!=' ' && *ri!='\t') break; key_enc.erase(ri.base(), key_enc.end()); if (!key_enc.empty()) key_list.push_back(key_enc); } while (getline(in) && line[0]!='\t' && line[0]!=' '); return true; } int dsl_parser::parse(MapFile& in, bool only_info, bool abr) { linenum=0; end_of_file=in.end(); int res=EXIT_FAILURE; CharsetConv conv; //try to determine encoding //TODO: add utf-32 support int ch1; if (in.cur key_list; if (!read_keys(in, conv, key_list)) return res; if (!in && line[0]!='\t' && line[0]!=' ') break; string datastr; do { const char *real_begin=line.c_str()+1; while (*real_begin && (*real_begin=='\t' || *real_begin==' ')) ++real_begin; if (*real_begin) datastr +=string(real_begin)+"\n"; } while (getline(in) && (line[0]=='\t' || line[0]==' ')); if (in.eof() && !line.empty() && (line[0]=='\t' || line[0]==' ')) { const char *real_begin=line.c_str()+1; while (*real_begin && (*real_begin=='\t' || *real_begin==' ')) ++real_begin; if (*real_begin) datastr +=string(real_begin)+"\n"; } if (!from_codeset.empty()) { string encoded_str; if (!conv.convert(datastr, encoded_str)) { gchar *mes=g_strdup_printf(_("Can not convert from %s to UTF-8"), from_codeset.c_str()); cerr<second; ++p; } } else if (*p=='~') { resstr+=key_list.back(); ++p; } else if (*p=='[') {//may be this is a tag ? if (*(p+1)!='/') {//not close tag if (strncmp(p, "[lang", sizeof("[lang")-1)==0) { //just skip it const char *closed_braket=strchr(p, ']'); if (closed_braket!=NULL) p=closed_braket+1; else cerr<<_("Tag [lang didn't closed")<p && (*not_blank=='\t' || *not_blank==' ')) --not_blank; string key(p, not_blank-p); resstr+=string("")+""+key+"\n"; have_subarticle=true; p=end_of_line+1; continue; } else { if (resstr[resstr.length()-1]=='\n') resstr.erase(resstr.begin()+resstr.length()-1); resstr+=""; have_subarticle=false; ++p; while (*p && (*p=='\t' || *p==' ')) ++p; if (*p=='\n') { ++p; continue; } goto subarticle_key; } } else if (strncmp(p, "^~", 2)==0) { gunichar ch=g_utf8_get_char(key_list.back().c_str()); if (g_unichar_islower(ch)) ch=g_unichar_toupper(ch); else if (g_unichar_isupper(ch)) ch=g_unichar_tolower(ch); char buf[7]; gint size=g_unichar_to_utf8(ch, buf); buf[size]='\0'; resstr+=string(buf)+g_utf8_next_char(key_list.back().c_str()); p+=2; } else { end_of_handle: const char *beg=p; Str2StrTable::const_iterator i; for (i=replace_table.begin(); i!=replace_table.end(); ++i) { p=beg; const char *q=i->first; while (*p && *q && *p==*q) ++p, ++q; if (*q=='\0') { resstr+=i->second; break; } } if (i==replace_table.end()) { p=beg; if ((c2si=text2xml.find(*p))==text2xml.end()) resstr+=*p; else resstr+=c2si->second; ++p; } } } if (resstr[resstr.length()-1]=='\n') resstr.erase(resstr.begin()+resstr.length()-1); norm_tags(resstr, datastr); if (!g_utf8_validate(datastr.c_str(), gssize(-1), NULL)) { cerr<<_("Not valid UTF-8 string: ")<