/*
    Copyright© 2021 John Sennesael

    UsenetSearch is Free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/

#include "usenetsearch/Filter.h"

#include "usenetsearch/StringUtils.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <regex>
#include <string>

namespace usenetsearch {

// Stores a reference to the configuration; no filter state is built until
// Init() is called.
Filter::Filter(Configuration& config):
    m_config(config),
    m_eraseTokenRegexes{}
{
}

// Loads the "no subtoken" word list from the configuration and pre-compiles
// the regular expressions used by ProcessSearchString() to erase configured
// subtokens.
void Filter::Init()
{
    m_noSubtokenWords = m_config.FilterWordsNoSubtoken();

    // Start from an empty table so that calling Init() more than once does
    // not accumulate duplicate patterns.
    m_eraseTokenRegexes.clear();

    // Pre-compile regexes for all the subtokens that should be erased.
    // NOTE(review): tokens are inserted into the pattern verbatim, so any
    // regex metacharacters in a configured token are interpreted as regex
    // syntax - confirm that this is intended.
    const auto eraseTokens = m_config.FilterEraseSubtoken();
    for (const std::string& tok: eraseTokens)
    {
        const std::wstring wtok = StringToLower(WideStringFromString(tok));
        // Token at the start of the string, followed by whitespace.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"^" + wtok + L"\\s+"),
            std::wstring{}
        );
        // Token at the end of the string, preceded by whitespace.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + wtok + L"$"),
            std::wstring{L""}
        );
        // Token surrounded by whitespace: keep a single separating space.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + wtok + L"\\s+"),
            std::wstring{L" "}
        );
    }
}

// Decides whether a newsgroup passes the configured filters.
//
// Returns false when the name fully matches any blacklist regex. Otherwise,
// when the whitelist is non-empty, returns true only if the name fully
// matches one of the whitelist regexes. With an empty whitelist every
// non-blacklisted newsgroup is accepted.
bool Filter::ProcessNewsgroup(const std::string& newsgroup) const
{
    for (const auto& blackRe: m_config.FilterNewsgroupBlacklist())
    {
        if (std::regex_match(newsgroup, blackRe)) return false;
    }

    const auto& whitelist = m_config.FilterNewsgroupWhitelist();
    if (whitelist.empty()) return true;

    for (const auto& whiteRe: whitelist)
    {
        if (std::regex_match(newsgroup, whiteRe)) return true;
    }
    return false;
}

// Normalises a search string: lower-cases it, strips control characters,
// replaces punctuation with whitespace, applies the erase-subtoken patterns
// built by Init(), collapses whitespace runs and trims the result.
//
// Returns the normalised string, which may be empty.
std::string Filter::ProcessSearchString(const std::string& searchString) const
{
    std::wstring str = StringToLower(WideStringFromString(searchString));

    // Remove control characters. std::remove_if only shifts the kept
    // characters forward and returns the new logical end; the erase() call
    // is required to actually shrink the string.
    str.erase(
        std::remove_if(str.begin(), str.end(), [](wchar_t c){
            if (c < 0x20) return true; // ascii control chars
            if ((c > 0x7e) && (c < 0xa0)) return true; // DEL + C1 controls
            return false; // don't delete anything else
        }),
        str.end()
    );

    // Remove punctuation and stuff by converting to whitespace
    static const std::wregex rxPunctuation(
        L"[\\.!?#$%^&~*\\(\\)\\[\\]\"\\-<>]+");
    str = std::regex_replace(str, rxPunctuation, L" ");

    // Process erase subtoken list.
    for (const auto& repl: m_eraseTokenRegexes)
    {
        str = std::regex_replace(str, *repl.first, repl.second);
    }

    // Convert repeated whitespace to just one space.
    static const std::wregex rxWhitespaceMerge(L"\\s+");
    str = std::regex_replace(str, rxWhitespaceMerge, L" ");

    // Trim the string.
    str = StringTrim(str);

    // Convert strings that are ONLY whitespace to blank strings.
    static const std::wregex rxAllWhitespace(L"^\\s+$");
    str = std::regex_replace(str, rxAllWhitespace, L"");

    return StringFromWideString(str);
}

// Filters a single (sub)token derived from a search string.
//
// A token that equals the full search string is always returned unchanged.
// Any other token is rejected (empty string returned) when it appears in the
// "no subtoken" word list, or when the configured minimum subtoken word
// count is greater than one and the token has fewer space-separated words
// than that minimum.
std::string Filter::ProcessToken(
    const std::string& token,
    const std::string& searchString) const
{
    // The complete search string is never filtered.
    if (token == searchString) return token;

    // Process the nosubtokens list.
    const auto listed = std::find(
        m_noSubtokenWords.begin(), m_noSubtokenWords.end(), token);
    if (listed != m_noSubtokenWords.end()) return "";

    // Process min subtoken word count. The word count is compared as
    // std::size_t so that very long tokens cannot wrap around a 16-bit
    // counter.
    const std::uint16_t minWords = m_config.MinSubtokenWords();
    if (minWords > 1)
    {
        const auto words = StringSplit(token, std::string{" "});
        if (words.size() < static_cast<std::size_t>(minWords)) return "";
    }

    return token;
}

} // namespace usenetsearch