/*
 * Copyright (C) 2012 Børre Gaup
 *
 * This file is part of the program wordlist2hunspell.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * SetOptimizer compacts the map between suffixes and stems that is collected
 * by WordlistParser.
 *
 * SetOptimizer produces two things that are later used by HunspellPrinter:
 * _suffixMap (see getSuffixMap()) and newSuffixesStemsMap
 * (see giveSubsetsToStems()).
 */

#include "setoptimizer.h"

#include <algorithm> // std::set_intersection, std::set_difference, std::includes
#include <iostream>  // std::cout, std::endl
#include <iterator>  // std::inserter
#include <map>
#include <set>

/*!
 * Stores a copy of the suffix set -> stems map and immediately computes the
 * optimized suffix subsets (see mapSubSets()).
 *
 * \param suffixSetStems map from a set of suffix ids to the set of stem ids
 *                       that take exactly those suffixes.
 */
SetOptimizer::SetOptimizer(std::map< std::set< int >, std::set< int > > suffixSetStems):
    _origSuffixSetStems(suffixSetStems)
{
    mapSubSets();
}

/*!
 * Extracts the original sets of suffixes collected by WordlistParser.
 *
 * \return the keys of _origSuffixSetStems, i.e. every distinct suffix set.
 */
std::set< std::set< int > > SetOptimizer::getSuffixSetsFromStems()
{
    std::set< std::set< int > > suffixSetSets;

    for (std::map< std::set< int >, std::set< int > >::iterator it = _origSuffixSetStems.begin();
         it != _origSuffixSetStems.end();
         ++it) {
        suffixSetSets.insert(it->first);
    }

    return suffixSetSets;
}

/*!
 * Converts the original suffix sets into new suffix sets.
 *
 * In the original suffix sets the same integer may appear in several sets,
 * whereas in the new suffix sets every integer appears in exactly one set.
 * Each original set is folded into the current collection of subsets: every
 * existing subset is split into the part shared with the original set and
 * the part that is not, and whatever remains of the original set becomes a
 * new subset of its own.
 *
 * Progress ("/" roughly once per percent of the input) and the final number
 * of subsets are written to std::cout.
 *
 * \return the refined suffix subsets; empty if no suffix sets were collected.
 */
std::set< std::set< int > > SetOptimizer::splitSuffixSetsIntoSubSets()
{
    std::set< std::set< int > > suffixSetSets = getSuffixSetsFromStems();
    std::set< std::set< int > > suffixSubSets;

    // Seed the result with the first original set. The emptiness check is
    // required: dereferencing begin() of an empty container is undefined
    // behaviour.
    std::set< std::set< int > >::iterator it = suffixSetSets.begin();
    if (it != suffixSetSets.end()) {
        suffixSubSets.insert(*it);
        ++it;
    }

    // Print a progress mark about once per percent, but at least every round.
    int onePercent = static_cast< int >(suffixSetSets.size() / 100);
    if (onePercent == 0) {
        onePercent = 1;
    }

    int counter = 0;
    for (; it != suffixSetSets.end(); ++it, ++counter) {
        // The part of the current original set not yet covered by any subset.
        std::set< int > originalSet = *it;
        std::set< std::set< int > > newSuffixSubSets;

        for (std::set< std::set< int > >::iterator affixSet = suffixSubSets.begin();
             affixSet != suffixSubSets.end();
             ++affixSet) {
            // Elements shared by the existing subset and the original set.
            std::set< int > intersection;
            std::set_intersection(affixSet->begin(), affixSet->end(),
                                  originalSet.begin(), originalSet.end(),
                                  std::inserter(intersection, intersection.begin()));
            if (!intersection.empty()) {
                newSuffixSubSets.insert(intersection);
            }

            // Elements of the existing subset that are not shared.
            std::set< int > difference1;
            std::set_difference(affixSet->begin(), affixSet->end(),
                                intersection.begin(), intersection.end(),
                                std::inserter(difference1, difference1.begin()));
            if (!difference1.empty()) {
                newSuffixSubSets.insert(difference1);
            }

            // Remove the shared elements from the original set; they are
            // now accounted for by 'intersection'.
            std::set< int > difference2;
            std::set_difference(originalSet.begin(), originalSet.end(),
                                intersection.begin(), intersection.end(),
                                std::inserter(difference2, difference2.begin()));
            originalSet.swap(difference2);
        }

        // Whatever is left was not part of any existing subset.
        if (!originalSet.empty()) {
            newSuffixSubSets.insert(originalSet);
        }

        suffixSubSets.swap(newSuffixSubSets);

        if ((counter % onePercent) == 0) {
            std::cout << "/";
        }
    }

    std::cout << std::endl
              << "size after subsetting: " << suffixSubSets.size()
              << std::endl << std::endl;

    return suffixSubSets;
}

/*!
 * Inserts the new suffix sets into _suffixMap, numbering them from 1 in the
 * iteration order of the set returned by splitSuffixSetsIntoSubSets().
 */
void SetOptimizer::mapSubSets()
{
    std::set< std::set< int > > suffixSubSets = splitSuffixSetsIntoSubSets();

    int affixIndex = 1;
    for (std::set< std::set< int > >::iterator it = suffixSubSets.begin();
         it != suffixSubSets.end();
         ++it, ++affixIndex) {
        _suffixMap[affixIndex] = *it;
    }
}

/*!
 * \return a copy of the map from subset index to the suffix ids it contains.
 */
std::map< int, std::set< int > > SetOptimizer::getSuffixMap() const
{
    return _suffixMap;
}

/*!
 * Maps the new suffix sets to the stems.
 *
 * Every original suffix set is replaced by the set of indices of the subsets
 * in _suffixMap that it fully contains; the stems keep following their
 * (rewritten) suffix set. Original sets that contain no subset are dropped.
 *
 * Note that _origSuffixSetStems is cleared before returning, so a second
 * call yields an empty map.
 *
 * \return map from a set of subset indices to the stems using those subsets.
 */
std::map< std::set< int >, std::set< int > > SetOptimizer::giveSubsetsToStems()
{
    // Read the member directly; there is no need to copy the whole map.
    const std::map< int, std::set< int > >& subSetMap = _suffixMap;
    std::map< std::set< int >, std::set< int > > newSuffixesStemsMap;

    for (std::map< std::set< int >, std::set< int > >::iterator it = _origSuffixSetStems.begin();
         it != _origSuffixSetStems.end();
         ++it) {
        const std::set< int >& originalSet = it->first;
        std::set< int > newSet;

        for (std::map< int, std::set< int > >::const_iterator it2 = subSetMap.begin();
             it2 != subSetMap.end();
             ++it2) {
            // Keep the index of every subset wholly contained in this set.
            if (std::includes(originalSet.begin(), originalSet.end(),
                              it2->second.begin(), it2->second.end())) {
                newSet.insert(it2->first);
            }
        }

        if (!newSet.empty()) {
            newSuffixesStemsMap[newSet] = it->second;
        }
    }

    _origSuffixSetStems.clear();

    return newSuffixesStemsMap;
}