#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Parser script for the 'mueller7' dictionary format (Mueller7GPL.koi).

Registers a custom 'sil' charmap codec used for the bracketed parts of
each input line, then drives an ``mdparser.MakeDictParser`` instance:
reads a three-line description from ``parser.f``, publishes the
dictionary metadata and opens the abbreviations section.

NOTE: this is Python 2 code (byte-string ``.decode``/``.encode``).
"""

import sys
import codecs
import re
import xml.sax.saxutils
import os
from optparse import OptionParser

import mdparser

# Byte value -> Unicode code point for the 'sil' codec.  Every byte maps to
# itself except for the overrides below.
decoding_map = codecs.make_identity_dict(range(256))
decoding_map.update({
    ord('Q'): 230,   # "a" from "man"
    ord('W'): 695,   # "w"
    ord('A'): 593,   # "a" from "past"
    249: ord(':'),   # ":"
    171: 601,        # "e" from "her"
    ord('E'): 603,   # "e" first from diphthong in "care"
    141: 596,        # "o" from "wash"
    195: 652,        # "a" from "son"
    ord('I'): 618,   # "i" from "ink"
    200: 712,        # "'"
    199: 716,        # ","
    ord('H'): 688,   # "h"
    ord('Z'): 658,   # "z"
    ord('N'): 331,   # "ng"
    ord('S'): 643,   # "sh"
    ord('D'): 240,   # "th" with voice
    ord('T'): 952,   # "th"
})
encoding_map = codecs.make_encoding_map(decoding_map)


class Codec(codecs.Codec):
    """Charmap codec built from ``decoding_map`` / ``encoding_map``."""

    def encode(self, input, errors='strict'):
        """Encode *input* with ``encoding_map``; returns (output, length)."""
        return codecs.charmap_encode(input, errors, encoding_map)

    def decode(self, input, errors='strict'):
        """Decode *input* with ``decoding_map``; returns (output, length)."""
        return codecs.charmap_decode(input, errors, decoding_map)


class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer for the 'sil' codec."""
    pass


class StreamReader(Codec, codecs.StreamReader):
    """Stream reader for the 'sil' codec."""
    pass


def find_sil_codec(name):
    """Codec search function registered with ``codecs.register``.

    Returns the (encode, decode, StreamReader, StreamWriter) tuple when
    *name* is 'sil', and None for every other name so the codec registry
    can carry on with its remaining search functions.
    """
    if name == 'sil':
        codec = Codec()
        return (codec.encode, codec.decode, StreamReader, StreamWriter)
    # Was `return none`, which raised NameError on any non-'sil' lookup.
    return None


codecs.register(find_sil_codec)


def decode_from_koi(str):
    """Re-encode the KOI8-R byte string *str* as UTF-8 bytes."""
    # The parameter shadows the builtin `str`; the name is kept so the
    # function's signature stays unchanged.
    return str.decode("koi8-r").encode("utf-8")


parser = mdparser.MakeDictParser()
parser.parser_info['version'] = '0.2'
parser.parser_info['dict_name'] = 'Mueller7GPL.koi'
parser.parser_info['format'] = 'mueller7'
parser.start()

# The first three lines of the input form the description; only the last
# one has its trailing whitespace stripped.
description = (decode_from_koi(parser.f.readline()) +
               decode_from_koi(parser.f.readline()) +
               decode_from_koi(parser.f.readline()).rstrip())

parser.begin()
parser.set_dict_info('full_name', 'Англо-русский словарь Мюллера, 7-я ред.')
parser.set_dict_info('lang_from', 'ENG')
parser.set_dict_info('lang_to', 'RUS')
parser.set_dict_info('description', xml.sax.saxutils.escape(description))
parser.info()
parser.abbrs_begin()

# Strings wrapped around each printed entry.
# NOTE(review): all of these are empty here; presumably they are meant to
# hold markup -- confirm the intended values.
art_beg = ''
art_end = ''
key_end = ''
data_beg = ''
data_end = ''

read_abbr = 1    # 1 while the abbreviations section is still being read
abbr_list = {}   # abbreviation text -> replacement text

# Splits text into runs of word characters, dots and hyphens; the capture
# group makes re.split keep those runs in its result.
word_pat = re.compile(r"([\w\.-]+)", re.UNICODE)
# Patterns applied to article bodies.  The groups are named because
# "(?P" without "<name>" is rejected by re.compile, and a bare "\g" in a
# replacement template is rejected by sub().
# NOTE(review): `data` has already been through xml.sax.saxutils.escape by
# the time ind_pat runs, so input '>' characters arrive as '&gt;' --
# confirm whether ind_pat should match '&gt;' instead of '>'.
ind_pat = re.compile(r"(?P<ind>\w+>)", re.UNICODE)     # word chars + '>'
ind_pat2 = re.compile(r"(?P<ind>\d+\.)", re.UNICODE)   # digits + '.'
# Whitespace followed by a run of ASCII letters, digits, dots, apostrophes
# and whitespace.
ex_pat = re.compile(r"(?P<ex>\s+[a-zA-Z0-9\.']+[a-zA-Z0-9\.'\s]+)",
                    re.UNICODE)

# Splits a line on "[...]" groups; odd-numbered pieces of the split result
# are the bracket contents.  Compiled once instead of once per line.
esc = re.compile(r'\[(.*?)\]')
# Text placed before each bracketed piece.
# NOTE(review): empty, like the other wrapper strings in this script;
# presumably markup belongs here -- confirm.
beg_tr = ''

for line in parser.f.readlines():
    pieces = []
    for i, sub in enumerate(esc.split(line)):
        if i % 2:
            # Bracketed piece: decoded with the custom 'sil' codec.  Only
            # the first bracketed piece of a line is followed by a newline.
            end_tr = '\n' if i == 1 else ''
            pieces.append(
                beg_tr + xml.sax.saxutils.escape(sub.decode('sil')) + end_tr)
        else:
            # Everything outside brackets is KOI8-R text.
            pieces.append(xml.sax.saxutils.escape(sub.decode('koi8-r')))
    res = ''.join(pieces)

    # Was res.split(" "), which raises ValueError on any line containing
    # more than one space.
    # NOTE(review): assumes the key and the body are separated by two
    # spaces in the input -- confirm against the dictionary file.
    key, data = res.split("  ", 1)
    data = data.rstrip()

    if read_abbr == 1:
        # Still in the abbreviations section: remember the abbreviation so
        # it can be substituted in article bodies later on.
        abbr_key = unicode(key)
        abbr_list[abbr_key] = u'' + abbr_key + u''
    else:
        # Replace each token that is a known abbreviation.
        data = u''.join(abbr_list.get(w, w) for w in word_pat.split(data))
        # NOTE(review): the source's indentation was lost; these three
        # substitutions are assumed to apply to article bodies only.
        data = ind_pat.sub("\n\\g<ind>", data)
        data = ind_pat2.sub("\n\\g<ind>", data)
        data = ex_pat.sub("\\g<ex>", data)

    print(art_beg + xml.sax.saxutils.escape(key.encode('utf-8')) + key_end +
          data_beg + data.encode('utf-8') + data_end + art_end)

    if key == u'_яп.':
        # This key ends the abbreviations section; every later line is
        # handled as a dictionary article.
        parser.abbrs_end()
        read_abbr = 0
        art_beg = ""
        art_end = ""
        data_beg = ""
        data_end = ""
        key_end = '\n'

parser.end()