/* Copyright (C) 2008-2013 Børre Gaup This file is part of the program wordlist2hunspell. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ #include // needed for output #include // needed for exit #include // needed for sort #include // needed for converting int to string #include "wordlistparser.h" /*! * Each unique stem is mapped to a unique number * Each unique suffix is mapped to a unique number * Each unique derivation is mapped to a unique number * * This mapping is done because the input file can be huge, so instead * of storing the strings representing stems, suffixes and derivations, each * string is assigned a number. * * Each stem has a set of suffixes and derivations. Because different stems * may have the same set of suffixes and derivations, this is recorded in * maps. * * The _suffixSetStemSet maps a suffixset to a list of stems * The _derivationSetStemSet maps a derivationset to a list of stems * */ WordlistParser::WordlistParser(): _stemid(1), _id(1), _lineCount(1), _plxClasses() { } void WordlistParser::addPlxClass(const std::string suffix) { std::string::size_type tab = suffix.find("\t"); _plxClasses.insert(suffix.substr(tab + 1, suffix.size() - tab + 1)); } /** * @brief Adds \a suffix to \a _suffixMap if it isn't found * * Updates \a _currentSuffixes with the id of the suffix * * @param suffix a suffix * @return void */ void WordlistParser::addSuffix(std::string suffix) { int suffixid = _suffixMap[suffix]; if (suffixid == 0) { _suffixMap[suffix] = _id; suffixid = _id; _id++; } _currentSuffixes.insert(suffixid); } void WordlistParser::writeStatistics() { std::cerr << _lineCount << " lines from the sorted file lead to" << std::endl; std::cerr << _stemid << " stems" << std::endl; std::cerr << _suffixMap.size() << " suffixes" << std::endl; } /** * @brief Adds a stem to \a _stemMap if it isn't found * * @return void */ void WordlistParser::addStem() { replaceSlash(); int stemid = _stemMap[getPreviousStem()]; if (stemid == 0) { _stemMap[getPreviousStem()] = _stemid; stemid = _stemid; _stemid++; } updateStemSuffixMap(stemid); } /** * @brief Updates \a _stemSuffixMap with the recorded suffixes in \a _currentSuffixes * * _currentSuffixes is cleared so that it is ready to record new suffixes for the next stem * * @param stemid points to the real stem in \_stemMap * @return void */ void WordlistParser::updateStemSuffixMap(int stemid) { _stemSuffixMap[stemid].insert(_currentSuffixes.begin(), _currentSuffixes.end()); _currentSuffixes.clear(); } void WordlistParser::addSuffixes(int stemid) { if (! _suffixSet.empty() ) { _suffixSetStemSet[_suffixSet].insert(stemid); } _suffixSet.clear(); } void WordlistParser::setPreviousStem(std::string stem) { _previousStem = stem; } std::string WordlistParser::getPreviousStem() const { return _previousStem; } void WordlistParser::replaceSlash() { std::string::size_type slash = _previousStem.find("/"); if (slash != std::string::npos) { _previousStem.replace(slash, 1, "\\/"); } } /** * @brief Adds \a _previousStem to \a _stemMap if necessary * * The bool stemWithLength1HasOnlyOneEmptySuffix is needed to protect against * stems with these features. They will give one letter suggestion, which are * not wanted. * * @param currentStem stem of the line * @param suffixes suffixes of the line * @return void */ void WordlistParser::handleStem(std::string currentStem, std::string suffix) { bool stemWithLength1HasEmptySuffix = (currentStem.length() == 1 && suffix == "0"); if (currentStem != getPreviousStem() && !stemWithLength1HasEmptySuffix) { addStem(); setPreviousStem(currentStem); updateSuffixsetStemsetMap(); } } /** * @brief Gets plxParts from the given string \a plxPart * * @param plxPart the plxpart of the inString * @return std::set */ std::set WordlistParser::getPlxParts(std::string plxPart) { std::string::size_type t; std::set plxParts; while((t = plxPart.find(",")) != std::string::npos) { plxParts.insert(plxPart.substr(0, t)); plxPart = plxPart.substr(t + 1, plxPart.size()); } plxParts.insert(plxPart); return plxParts; } /** * @brief Updates and prints linecount * * @return void */ void WordlistParser::updateAndPrintLineCount() { _lineCount++; if ((_lineCount % 100000) == 0) { std::cerr << "."; } } /** * @brief Adds the content of \a _stemSuffixMap to \a _suffixSetStemSet, then clears \a _stemSuffixMap * * @return void */ void WordlistParser::updateSuffixsetStemsetMap() { if (_firstChar != _previousStem.substr(0, 1)) { for (std::map >::iterator it = _stemSuffixMap.begin(); it != _stemSuffixMap.end(); ++it) { _suffixSetStemSet[it->second].insert(it->first); } _stemSuffixMap.clear(); _firstChar = _previousStem.substr(0, 1); } } WordlistParser::~WordlistParser() { } SuffixSetStemSetMap WordlistParser::getSuffixSetStemSet() const { return _suffixSetStemSet; } SuffixMap WordlistParser::getSuffixMap() const { return _suffixMap; } std::map WordlistParser::getStems() const { return _stemMap; } std::set WordlistParser::getPlxClasses() const { return _plxClasses; } void WordlistParser::setFirstChar() { _firstChar = _previousStem.substr(0, 1); }