#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys, codecs, re, xml.sax.saxutils, os
from optparse import OptionParser
import mdparser
# Decoding table for the 'sil' codec.  It starts as the identity mapping for
# all 256 byte values; the entries below override the bytes that stand for
# phonetic symbols (the quoted text names the sound each symbol represents).
decoding_map = codecs.make_identity_dict(range(256))
decoding_map[ord('Q')] = 230       # "a" from "man"
decoding_map[ord('W')] = 695       # "w"
decoding_map[ord('A')] = 593       # "a" from "past"
decoding_map[249] = ord(':')       # ":"
decoding_map[171] = 601            # "e" from "her"
decoding_map[ord('E')] = 603       # "e" first from diphthong in "care"
decoding_map[141] = 596            # "o" from "wash"
decoding_map[195] = 652            # "a" from "son"
decoding_map[ord('I')] = 618       # "i" from "ink"
decoding_map[200] = 712            # "'"
decoding_map[199] = 716            # ","
decoding_map[ord('H')] = 688       # "h"
decoding_map[ord('Z')] = 658       # "z"
decoding_map[ord('N')] = 331       # "ng"
decoding_map[ord('S')] = 643       # "sh"
decoding_map[ord('D')] = 240       # "th" with voice
decoding_map[ord('T')] = 952       # "th"

# Reverse table used by the encoder, derived from the decoding table.
encoding_map = codecs.make_encoding_map(decoding_map)
class Codec(codecs.Codec):
    """Stateless charmap codec driven by ``encoding_map`` / ``decoding_map``."""

    def encode(self, input, errors='strict'):
        """Encode *input* through ``encoding_map``.

        Returns whatever ``codecs.charmap_encode`` returns for the given
        *errors* policy.
        """
        encoded = codecs.charmap_encode(input, errors, encoding_map)
        return encoded

    def decode(self, input, errors='strict'):
        """Decode *input* through ``decoding_map``.

        Returns whatever ``codecs.charmap_decode`` returns for the given
        *errors* policy.
        """
        decoded = codecs.charmap_decode(input, errors, decoding_map)
        return decoded
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that takes its encode/decode methods from ``Codec``."""
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that takes its encode/decode methods from ``Codec``."""
def find_sil_codec(name):
    """Codec search function for the custom 'sil' encoding.

    Returns the ``(encode, decode, StreamReader, StreamWriter)`` tuple when
    *name* is ``'sil'``, and ``None`` for every other name so the codec
    registry can report the encoding as unknown.
    """
    if name != 'sil':
        # This used to be ``return none``, which raised NameError instead of
        # declining the lookup.
        return None
    codec = Codec()
    return (codec.encode, codec.decode, StreamReader, StreamWriter)


# Make the 'sil' encoding available to str.decode()/unicode.encode().
codecs.register(find_sil_codec)
def decode_from_koi(str):
    """Return the KOI8-R encoded byte string *str* re-encoded as UTF-8."""
    text = str.decode("koi8-r")
    return text.encode("utf-8")
# Create the dictionary parser and record what this script converts.
parser = mdparser.MakeDictParser()
parser.parser_info['version'] = '0.2'
parser.parser_info['dict_name'] = 'Mueller7GPL.koi'
parser.parser_info['format'] = 'mueller7'
parser.start()

# The first three lines read from parser.f make up the description; only
# the last of them has its trailing whitespace removed.
description = ''.join([
    decode_from_koi(parser.f.readline()),
    decode_from_koi(parser.f.readline()),
    decode_from_koi(parser.f.readline()).rstrip(),
])

parser.begin()
# Dictionary metadata, set in the same order as before.
for info_name, info_value in (
        ('full_name', 'Англо-русский словарь Мюллера, 7-я ред.'),
        ('lang_from', 'ENG'),
        ('lang_to', 'RUS'),
        ('description', xml.sax.saxutils.escape(description)),
):
    parser.set_dict_info(info_name, info_value)
parser.info()
# Conversion pass.  The input starts with a list of abbreviations that ends
# with the entry whose key is u'_яп.'; every later line is a regular article.
parser.abbrs_begin()

# Text emitted around each key and each definition.  These values are used
# for the abbreviation section and are reassigned once that section ends.
# NOTE(review): the empty literals here and in the loop look like
# placeholders for markup -- confirm the intended values.
art_beg = ''
art_end = ''
key_end = ''
data_beg = ''
data_end = ''
read_abbr = 1    # 1 while the abbreviation section is being read, then 0
abbr_list = {}   # abbreviation key -> text substituted for it in articles

# All patterns are compiled once, outside the per-line loop.
esc = re.compile(r'\[(.*?)\]')
word_pat = re.compile(r"([\w\.-]+)", re.UNICODE)
# The named groups were written as "(?P...)" with no group name, which is
# not a valid pattern; the replacement templates had the same problem.
ind_pat = re.compile(r"(?P<ind>\w+>)", re.UNICODE)
ind_pat2 = re.compile(r"(?P<ind>\d+\.)", re.UNICODE)
ex_pat = re.compile(r"(?P<ex>\s+[a-zA-Z0-9\.']+[a-zA-Z0-9\.'\s]+)", re.UNICODE)

for line in parser.f.readlines():
    # esc.split() alternates text outside brackets (even indices, KOI8-R)
    # with the contents of [...] sections (odd indices, 'sil' codec).
    pieces = []
    for i, sub in enumerate(esc.split(line)):
        if i % 2:
            pieces.append('' + xml.sax.saxutils.escape(sub.decode('sil')))
            if i == 1:
                # Only the first bracketed section is followed by a newline.
                pieces.append('\n')
        else:
            pieces.append(xml.sax.saxutils.escape(sub.decode('koi8-r')))
    res = u''.join(pieces)

    # maxsplit=1 keeps spaces inside the definition from breaking the
    # two-way unpacking.
    # NOTE(review): assumes the key itself contains no space -- confirm the
    # key/definition delimiter against the input format.
    key, data = res.split(" ", 1)
    data = data.rstrip()

    if read_abbr == 1:
        # key is already unicode because every piece of res was decoded.
        abbr_list[key] = u'' + key + u''
    else:
        # Replace every known abbreviation; word_pat.split() also returns
        # the separators, so joining restores the rest of the text as is.
        data = u''.join(abbr_list.get(w, w) for w in word_pat.split(data))
        # NOTE(review): placing these substitutions in the article branch is
        # inferred; the source indentation was lost.
        data = ind_pat.sub("\n\\g<ind>", data)
        data = ind_pat2.sub("\n\\g<ind>", data)
        data = ex_pat.sub("\\g<ex>", data)

    # key was XML-escaped while res was built; escaping it again here would
    # turn "&amp;" into "&amp;amp;".
    print(art_beg + '' + key.encode('utf-8') + key_end + data_beg
          + data.encode('utf-8') + data_end + art_end)

    if key == u'_яп.':
        # Last abbreviation: close the section and switch to article output.
        parser.abbrs_end()
        read_abbr = 0
        art_beg = ""
        art_end = ""
        data_beg = ""
        data_end = ""
        key_end = '\n'

parser.end()