#!/usr/bin/env python
"""Build hunspell .dic and .aff files from pre-filtered stem/suffix lists.

Run as a script with two arguments: the language name (used as the base
name of the generated files) and the directory holding the input files
'dics', 'ders', 'affs' and 'phonrules'.
"""

import os
import sys


class set_calculator(object):
    """Turn stem, derivation and suffix lists into hunspell dic/aff files.

    Typical use is the sequence run by main(): read_files(),
    calculate_sets(), calculate_sets2(), print_sets() and finally close().

    Attributes:
        setlist: unique sets of continuation ids, in order of first
            appearance; an entry's 1-based position is its AF alias number.
        diclist: disjoint pieces of the sets in setlist, filled by
            calculate_sets(); a piece's 1-based position is its SFX flag.
        afflist: unique flag strings found after the slash in the affs
            file; their AF alias numbers continue after those of setlist.
        sfx_dict: maps the numeric id of a derivation or suffix line to
            the text written after 'SFX <flag> 0 ' in the aff file.
        new_dic_dict: maps a 0-based setlist index to the ascending list
            of 1-based diclist positions that make up that set.
        affstart: fixed header lines of the aff file.
    """

    def __init__(self, lang, path):
        """Create an instance and open all input and output files.

        lang specifies which language is to be processed; it is used as
        the base name of the generated .dic and .aff files.
        path specifies which directory the files are read from and
        written to.
        """
        self.setlist = []
        self.diclist = []
        self.afflist = []
        self.sfx_dict = {}
        self.new_dic_dict = {}
        # Lookup tables mirroring setlist / afflist so that finding an
        # existing entry does not need a linear scan per input line.
        self._set_index = {}
        self._aff_index = {}
        self.affstart = ['FLAG num',
                         'SET UTF-8',
                         'NEEDAFFIX 65529',
                         'COMPOUNDPERMITFLAG 65530',
                         'COMPOUNDBEGIN 65531',
                         'COMPOUNDMIDDLE 65532',
                         'COMPOUNDEND 65533',
                         'COMPOUNDFORBIDFLAG 65534',
                         'ONLYINCOMPOUND 65535']

        self.open_files(lang, path)

    def open_files(self, lang, path):
        """Open the files that are read and written.

        dics contains all the stems that eventually will be written to
        the dic file.
        ders contains all the derivational stems. These will eventually
        be written to the aff file.
        affs contains all suffixes. These will eventually be written to
        the aff file.
        phonrules contains the phonetic rules; it is copied verbatim into
        the aff file.
        <lang>.dic and <lang>.aff become the hunspell dic and aff files.

        According to the original notes dics, ders and affs are produced
        by filter_plx_files and phonrules is handwritten per language.

        The input files are opened first, so a missing input raises
        before any existing output file is truncated.
        """
        # os.path.join copes with a trailing separator and with an empty
        # path (current directory); appending '/' by hand turned '' into
        # the filesystem root.
        self.dics = open(os.path.join(path, 'dics'))
        self.ders = open(os.path.join(path, 'ders'))
        self.affs = open(os.path.join(path, 'affs'))
        self.phonrules = open(os.path.join(path, 'phonrules'))

        self.dic = open(os.path.join(path, lang + '.dic'), 'w')
        self.aff = open(os.path.join(path, lang + '.aff'), 'w')

    def close(self):
        """Close every file opened by open_files().

        Safe to call more than once and after a partially failed
        open_files(); closing the output files flushes them to disk.
        """
        for name in ('dics', 'ders', 'affs', 'phonrules', 'dic', 'aff'):
            handle = getattr(self, name, None)
            if handle is not None:
                handle.close()

    def add_numbers_to_set(self, numberlist):
        """Register a set of numbers and return its 1-based index.

        numberlist is a list of strings. These strings are converted to
        integers (ValueError propagates for a non-numeric string) and
        collected in a set. If an equal set is already in self.setlist,
        its position + 1 is returned. If not, the set is appended to
        self.setlist and the new length of self.setlist is returned.
        """
        key = frozenset(int(i) for i in numberlist)
        index = self._set_index.get(key)
        if index is None:
            self.setlist.append(set(key))
            index = len(self.setlist)
            self._set_index[key] = index
        return index

    def read_files(self):
        """Read the dics, ders and affs files, in that order.

        The dic output file is written and closed as part of reading
        dics. The order matters: the AF numbers handed out while reading
        affs start after the last set registered from dics and ders.
        """
        self._read_dics()
        self._read_ders()
        self._read_affs()

    def _read_dics(self):
        """Read the stems and write the finished dic file.

        Each line has a stem and a comma separated set of numbers,
        divided by exactly one slash. The set is registered in
        self.setlist and the line is rewritten as stem/<set index>.
        The dic file starts with the number of accepted stems.
        """
        print("start: dics")
        dicwords = []
        for line in self.dics:
            lastslash = line.rfind("/")
            if lastslash > 0 and line.count('/') == 1:
                numbers = line[lastslash + 1:].strip().split(',')
                index = self.add_numbers_to_set(numbers)
                dicwords.append(line[:lastslash] + '/' + str(index))
            else:
                print("This should not occur, dics  " + line)

        self.dic.write(str(len(dicwords)) + '\n')
        self.dic.write('\n'.join(dicwords))
        self.dic.close()

    def _read_ders(self):
        """Read the derivational stems into self.sfx_dict.

        Each line is '<id> <stem>/<numbers>'. The set after the last
        slash is registered in self.setlist; sfx_dict[id] becomes
        '<stem>/<set index> .'.
        """
        print("start: ders")
        for line in self.ders:
            lastslash = line.rfind("/")
            if lastslash > 0:
                numbers = line[lastslash + 1:].strip().split(',')
                index = self.add_numbers_to_set(numbers)
                sfx = line[:lastslash].split(" ")
                self.sfx_dict[int(sfx[0])] = sfx[1] + '/' + str(index) + ' .'
            else:
                print("This should not occur, ders  " + line)

    def _read_affs(self):
        """Read the suffixes into self.sfx_dict.

        Each line is space separated: an id, then the suffix optionally
        followed by '/<flags>', then any remaining fields, which are kept
        unchanged. A flags string is stored once in self.afflist and
        replaced by its AF number, which is its 1-based position in
        afflist plus the number of sets in self.setlist.
        """
        print("start: affs")
        # setlist no longer grows at this point, so the offset is fixed.
        offset = len(self.setlist)
        for line in self.affs:
            splits = line.strip().split(' ')
            if len(splits) < 2:
                # Blank or one-token line: report and skip it, like the
                # dics and ders readers do, instead of raising IndexError.
                print("This should not occur, affs  " + line)
                continue
            if splits[1].find('/') > 0:
                suffix = splits[1].split('/')
                flags = suffix[1]
                position = self._aff_index.get(flags)
                if position is None:
                    self.afflist.append(flags)
                    position = len(self.afflist)
                    self._aff_index[flags] = position
                splits[1] = suffix[0] + '/' + str(position + offset)
            self.sfx_dict[int(splits[0])] = ' '.join(splits[1:])

    def calculate_sets(self):
        """Split the sets in self.setlist into disjoint pieces.

        The result, self.diclist, contains all the numbers that are in
        self.setlist, divided into pieces that are as big as possible
        while every set in self.setlist is still a union of whole pieces.

        For each set in self.setlist:
        If self.diclist is empty, the set becomes the first piece.
        Otherwise every existing piece is split into the part it shares
        with the set and the part it does not share; non-empty parts are
        kept in that order. Whatever is left of the set after all pieces
        have been visited becomes a new piece at the end.
        """
        print("calculate_sets")
        for number in self.setlist:
            if not self.diclist:
                self.diclist.append(number)
                continue

            remaining = number
            refined = []
            for part in self.diclist:
                common = part & remaining
                if common:
                    refined.append(common)
                rest = part - common
                if rest:
                    refined.append(rest)
                # New set object each time, so the entry in self.setlist
                # is never modified.
                remaining = remaining - common
            if remaining:
                refined.append(remaining)
            self.diclist = refined

    def calculate_sets2(self):
        """Map every set in self.setlist to the pieces in self.diclist.

        For each set, self.new_dic_dict[<0-based set index>] becomes the
        ascending list of 1-based positions of the pieces that overlap
        the set. As a sanity check the pieces are joined again and
        compared with the original set; a mismatch is reported on stdout.
        """
        print("calc sets2")
        for set_index, number in enumerate(self.setlist):
            remaining = number
            pieces = []
            for dic_index, part in enumerate(self.diclist):
                if not remaining:
                    # Nothing left to match; later pieces cannot overlap.
                    break
                common = part & remaining
                if common:
                    pieces.append(dic_index + 1)
                    remaining = remaining - common
            # Positions are collected in ascending order already.
            self.new_dic_dict[set_index] = pieces

            rebuilt = set()
            for position in pieces:
                rebuilt.update(self.diclist[position - 1])
            if rebuilt != number:
                # "ulike sett" is Norwegian for "unequal sets".
                print("ulike sett! %s" % set_index)
                print(number)
                print(rebuilt)

    def print_sets(self):
        """Write the aff file: header, phonrules, AF aliases and SFX rules.

        The AF section starts with the total alias count, followed by one
        alias per set in self.setlist (its piece numbers plus the
        NEEDAFFIX flag 65529) and one alias per entry in self.afflist.
        Each piece in self.diclist then becomes one SFX class whose rules
        are looked up in self.sfx_dict (KeyError if an id is unknown).
        """
        print("print_sets")
        write = self.aff.write

        # Terminate the header so that the first phonrules line starts
        # on a line of its own.
        write('\n'.join(self.affstart) + '\n')

        last_line = '\n'
        for line in self.phonrules:
            write(line)
            last_line = line
        if not last_line.endswith('\n'):
            # phonrules without a final newline must not swallow 'AF n'.
            write('\n')

        set_count = len(self.setlist)
        write('AF ' + str(set_count + len(self.afflist)) + '\n')
        for x in range(set_count):
            numberlist = self.new_dic_dict[x]
            write('AF ' + ','.join(map(str, numberlist)) + ',65529'
                  + ' # ' + str(x + 1) + '\n')

        for y, item in enumerate(self.afflist):
            write('AF ' + item + ' # ' + str(set_count + y + 1) + '\n')

        write('\n')

        for index, item in enumerate(self.diclist):
            flag = str(index + 1)
            write('SFX ' + flag + ' Y ' + str(len(item)) + '\n')
            # Sorted so that the generated file is reproducible.
            for i in sorted(item):
                write('SFX ' + flag + ' 0 ' + self.sfx_dict[i] + '\n')
            write('\n')


def usage():
    """Print a one-line description of the command line arguments."""
    print("Usage: %s <lang> <path>" % os.path.basename(sys.argv[0]))


def main():
    """Run the whole conversion for the language and path given in argv."""
    args = sys.argv[1:]

    if len(args) != 2:
        usage()
        sys.exit(2)

    sc = set_calculator(args[0], args[1])
    try:
        sc.read_files()
        sc.calculate_sets()
        sc.calculate_sets2()
        sc.print_sets()
    finally:
        # Flush the aff file and release the inputs even on failure.
        sc.close()


if __name__ == "__main__":
    main()