/* * This file is part of makedict - convertor from any * dictionary format to any http://sdcv.sourceforge.net * * Copyright (C) Evgeniy Dushistov, 2005 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
/*
 * sdict_parser: makedict_parser subclass for the "sdict" dictionary format
 * (its constructor registers the parser info "format" = "sdict").
 *
 * Members declared in the class that follows:
 *  - is_my_format(url): passes url and ".dct" to g_str_has_suffix() and
 *    returns the result, i.e. accepts names ending in ".dct".
 *  - parse(filename): conversion entry point; defined out of line further
 *    down in this file.
 *  - compression_method: one of none / gzip / bzip2.
 *  - title, copyright, version: strings that parse() resets to "" when it
 *    starts.
 *  - data_buffer: buffer written by read_unit().
 *  - replace_table, text2xml, taginfo_list: static tables shared by all
 *    instances; the constructor fills them only while replace_table is
 *    still empty.
 *  - langs: lookup table built by the constructor from the langs_2to3 array.
 *  - read_unit(f, offset): loads one unit from f into data_buffer; defined
 *    out of line further down in this file.
 *  - convert_lang(lang): copies lang, calls tolower() on the copy
 *    (presumably the project helper from utils.hpp - TODO confirm), looks
 *    the result up in langs and returns the mapped value. When the key is
 *    absent it returns "unkown language: " + lang; the misspelling is part
 *    of the runtime string and is deliberately left untouched here.
 *
 * NOTE(review): in this copy of the file the text that stood between angle
 * brackets appears to be missing (bare "#include" directives, "std::vector"
 * and "std::map" without template arguments). Restore those from the
 * upstream source before trying to build.
 */
#ifdef HAVE_CONFIG_H # include "config.h" #endif #include #include #include #include #include #include #include #include "langs_2to3.hpp" #include "normalize_tags.hpp" #include "utils.hpp" #include "parser.hpp" //#define DEBUG //#define TEST class sdict_parser : public makedict_parser { public: sdict_parser(); ~sdict_parser() {} protected: int parse(const std::string& filename); bool is_my_format(const std::string& url) { return g_str_has_suffix(url.c_str(), ".dct"); } private: enum {none, gzip, bzip2} compression_method; std::string title, copyright, version; std::vector data_buffer; static Str2StrTable replace_table; typedef std::map > Char2Str; static Char2Str text2xml; static TagInfoList taginfo_list; std::map langs; bool read_unit(std::ifstream& f, guint32 offset); std::string convert_lang(const std::string& lang) { std::string buf(lang); tolower(buf); std::map::iterator it=langs.find(buf); if (it==langs.end()) return "unkown language: "+lang; return it->second; } }; Str2StrTable sdict_parser::replace_table; sdict_parser::Char2Str sdict_parser::text2xml; TagInfoList sdict_parser::taginfo_list; 
/*
 * Out-of-line definitions for sdict_parser, followed by main().
 *
 * sdict_parser::sdict_parser()
 *   Builds langs from the range [langs_2to3, langs_2to3 + langs_2to3_count),
 *   sets the parser info entries "format" and "version", creates empty
 *   "lang_from" / "lang_to" options and inserts a fixed list of
 *   control-character codes (0x02 ... 0x1D) into not_valid_chars.
 *   The static tables replace_table, taginfo_list and text2xml are filled
 *   only when replace_table is still empty; later constructions return
 *   before touching them.
 *
 * int sdict_parser::parse(const std::string& filename)
 *   Opens filename as a binary std::ifstream and reports "Can not open: "
 *   when that fails. In the part of the body that is visible here, an index
 *   entry is cleaned with remove_not_valid(), the current stream position is
 *   remembered, and the article is loaded with
 *   read_unit(f, articles_offset + article_pointer) ("Can not read article"
 *   is reported on failure). The article text is then rebuilt: where a
 *   replace_table key matches at the current position its mapped value is
 *   appended, otherwise the single character is appended after translation
 *   through text2xml when it has an entry there. The result goes through
 *   norm_tags() and remove_not_valid(), is checked with g_utf8_validate()
 *   ("Not valid utf-8" is reported on failure), is passed to article()
 *   together with a one-element key list, and the stream is moved back to
 *   the remembered position. res starts as EXIT_FAILURE and is set to
 *   EXIT_SUCCESS immediately before the final return.
 *   (The header parsing and the loop header are not visible in this copy.)
 *
 * bool sdict_parser::read_unit(std::ifstream& f, guint32 offset)
 *   Seeks f to offset and reports "Can not set current position to: " when
 *   the seek fails. In the decompression branch the contents of data_buffer
 *   are inflated with uncompress() into a temporary buffer that starts at
 *   record_size * 4 bytes and grows by record_size bytes each time
 *   Z_BUF_ERROR is returned; any other result that is not Z_OK makes the
 *   function return false. The inflated bytes replace data_buffer, which is
 *   enlarged if needed and NUL-terminated at dest_len. The other branch
 *   NUL-terminates data_buffer at record_size. Returns true on success.
 *   NOTE(review): if record_size were 0, dest would be empty while &dest[0]
 *   is taken, and the Z_BUF_ERROR retry would never enlarge the buffer -
 *   confirm that the (not visible) code before this point rules that out.
 *
 * int main(int argc, char *argv[])
 *   Constructs one sdict_parser and returns parser.run(argc, argv).
 *
 * NOTE(review): this copy of the file appears to have lost the text that
 * stood between angle brackets (tag literals used as replace_table keys,
 * TagInfo arguments, parts of statements following "<<"). The code below is
 * reproduced unchanged; restore it from the upstream source before building.
 */
sdict_parser::sdict_parser() : langs(langs_2to3, langs_2to3+langs_2to3_count) { set_parser_info("format", "sdict"); set_parser_info("version", "sdict_parser, version 0.1"); parser_options["lang_from"]=""; parser_options["lang_to"]=""; not_valid_chars.insert(0x02); not_valid_chars.insert(0x03); not_valid_chars.insert(0x05); not_valid_chars.insert(0x06); not_valid_chars.insert(0x07); not_valid_chars.insert(0x08); not_valid_chars.insert(0x0C); not_valid_chars.insert(0x0E); not_valid_chars.insert(0x0F); not_valid_chars.insert(0x14); not_valid_chars.insert(0x15); not_valid_chars.insert(0x16); not_valid_chars.insert(0x1B); not_valid_chars.insert(0x1C); not_valid_chars.insert(0x1D); if (!replace_table.empty()) return; replace_table["
"]=replace_table["
"]="\n"; replace_table["

"]="\n\n"; replace_table[""]=""; replace_table[""]=""; replace_table[""]=""; replace_table[""]=""; replace_table[""]=""; replace_table[""]=""; replace_table["

  • "]=""; replace_table["
  • "]=""; //FIXME: find out what it form of word and change to aporopriate tag replace_table[""]=""; replace_table[""]=""; replace_table["Î"]="Î"; taginfo_list.push_back(TagInfo("", "", "", "", TagInfo::tSub)); taginfo_list.push_back(TagInfo("", "", "", "", TagInfo::tSup)); taginfo_list.push_back(TagInfo("", "", "", "", TagInfo::tB)); taginfo_list.push_back(TagInfo("", "", "", "", TagInfo::tI)); taginfo_list.push_back(TagInfo("", "", "", "", TagInfo::tColor)); text2xml['<']="<"; text2xml['>']=">"; text2xml['&']="&"; text2xml['\"']="""; } int sdict_parser::parse(const std::string& filename) { guint16 next_word, prev_word; guint32 article_pointer, index_size; guint32 wordcount, short_index_length; guint32 title_offset, copyright_offset, version_offset, short_index_offset, full_index_offset, articles_offset; title=copyright=version=""; int res=EXIT_FAILURE; std::ifstream f(filename.c_str(), std::ios::binary | std::ios::in); if (!f) { std::cerr<<_("Can not open: ")<second; ++q; } remove_not_valid(encoded_index); long cur_offset=f.tellg(); if (!read_unit(f, articles_offset+article_pointer)) { std::cerr<<_("Can not read article")<first; while (*p && *q && *p==*q) ++p, ++q; if (*q=='\0') { encoded_data+=i->second; break; } } if (i==replace_table.end()) { p=beg; if ((c2si=text2xml.find(*p))==text2xml.end()) encoded_data+=*p; else encoded_data+=c2si->second; ++p; } } } { std::string datastr; norm_tags(encoded_data, datastr); encoded_data=datastr; } remove_not_valid(encoded_data); #if 1 if (!g_utf8_validate(encoded_data.c_str(), gssize(-1), NULL)) { std::cerr<<_("Not valid utf-8")< key_list(1, encoded_index); article(key_list, encoded_data); f.seekg(cur_offset); } res=EXIT_SUCCESS; return res; } bool sdict_parser::read_unit(std::ifstream& f, guint32 offset) { if (!f.seekg(offset)) { std::cerr<<_("Can not set current position to: ")< dest; dest.resize(record_size*4); uLongf dest_len; for(;;) { dest_len=dest.size(); int res=uncompress((Bytef *)&dest[0], (uLongf *)&dest_len, 
(Bytef *)&data_buffer[0], record_size); if (Z_OK==res) break; if (Z_BUF_ERROR==res) { dest.resize(dest.size()+record_size); continue; } return false; } data_buffer=dest; if (data_buffer.size()<=dest_len+1) data_buffer.resize(dest_len+1); data_buffer[dest_len]='\0'; } else { data_buffer[record_size]='\0'; } return true; } int main(int argc, char *argv[]) { sdict_parser parser; return parser.run(argc, argv); }