UsenetSearch/src/Filter.cpp

138 lines
4.5 KiB
C++

/*
Copyright© 2021 John Sennesael
UsenetSearch is Free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include "usenetsearch/Filter.h"
#include "usenetsearch/StringUtils.h"
#include <algorithm>
#include <iostream>
#include <regex>
namespace usenetsearch {
// Creates a filter that takes its settings from `config`.
//
// The constructor only initializes m_config from `config` and value-initializes
// m_eraseTokenRegexes. No configuration values are read here; that happens in
// Init().
Filter::Filter(Configuration& config)
    : m_config(config)
    , m_eraseTokenRegexes()
{
}
// Loads the filter settings from the configuration and pre-compiles the
// regexes used by ProcessSearchString() to erase unwanted subtokens.
//
// Calling Init() again replaces the previously loaded state: the no-subtoken
// word list is overwritten and the compiled regexes are rebuilt from scratch.
//
// Each configured erase token is lower-cased and inserted verbatim into three
// patterns, so regex metacharacters in a token are interpreted as regex
// syntax. An invalid pattern makes the std::wregex constructor throw
// std::regex_error, which propagates to the caller.
void Filter::Init()
{
    m_noSubtokenWords = m_config.FilterWordsNoSubtoken();

    // Drop regexes compiled by an earlier Init() call; without this, every
    // call would append another full set and tokens removed from the
    // configuration would keep being erased.
    m_eraseTokenRegexes.clear();

    // Pre-compile regexes for all the subtokens that should be erased.
    const auto eraseTokens = m_config.FilterEraseSubtoken();
    for (const std::string& tok : eraseTokens)
    {
        const std::wstring wtok = StringToLower(WideStringFromString(tok));

        // Token at the start of the string: drop it and the whitespace after.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"^" + wtok + L"\\s+"),
            std::wstring{}
        );
        // Token at the end of the string: drop it and the whitespace before.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + wtok + L"$"),
            std::wstring{L""}
        );
        // Token in the middle: collapse it and its surrounding whitespace
        // into a single space so the neighbouring words stay separated.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + wtok + L"\\s+"),
            std::wstring{L" "}
        );
    }
}
// Decides whether a newsgroup passes the configured black- and whitelists.
//
// Returns false if the full newsgroup name matches any blacklist regex.
// Otherwise, if the whitelist is empty, every newsgroup is accepted; if it is
// not empty, the name must fully match at least one whitelist regex.
bool Filter::ProcessNewsgroup(const std::string& newsgroup) const
{
    // std::regex_match only succeeds when the whole name matches the pattern.
    const auto matchesGroup = [&newsgroup](const auto& pattern)
    {
        return std::regex_match(newsgroup, pattern);
    };

    // The blacklist always wins, so it is checked first.
    const auto& denied = m_config.FilterNewsgroupBlacklist();
    if (std::any_of(denied.begin(), denied.end(), matchesGroup))
    {
        return false;
    }

    // An empty whitelist means "no restriction".
    const auto& allowed = m_config.FilterNewsgroupWhitelist();
    if (allowed.empty())
    {
        return true;
    }
    return std::any_of(allowed.begin(), allowed.end(), matchesGroup);
}
// Normalizes a search string before it is tokenized.
//
// The steps, in order:
//   1. convert to a lower-cased wide string,
//   2. remove control characters,
//   3. turn runs of punctuation into a single space,
//   4. apply the erase-subtoken regexes compiled by Init(),
//   5. collapse repeated whitespace, trim, and blank out strings that
//      consist of whitespace only.
//
// Returns the normalized string converted back to a narrow string; the result
// may be empty.
std::string Filter::ProcessSearchString(const std::string& searchString) const
{
    std::wstring str = StringToLower(WideStringFromString(searchString));

    // Remove control characters. std::remove_if only moves the characters
    // that are kept to the front and returns the new logical end; the erase()
    // call is what actually shortens the string. Without it the string keeps
    // its old length and ends in leftover characters.
    str.erase(
        std::remove_if(str.begin(), str.end(), [](wchar_t c){
            if (c < 0x20) return true; // ascii control chars
            if ((c > 0x7e) && (c < 0xa0)) return true; // DEL and C1 controls
            return false; // don't delete anything else
        }),
        str.end()
    );

    // Remove punctuation and stuff by converting to whitespace
    static const std::wregex rxPunctuation(
        L"[\\.!?#$%^&~*\\(\\)\\[\\]\"\\-<>]+");
    str = std::regex_replace(str, rxPunctuation, L" ");

    // Process erase subtoken list: each entry pairs a compiled regex with the
    // text that replaces its matches.
    for (const auto& entry : m_eraseTokenRegexes)
    {
        str = std::regex_replace(str, *entry.first, entry.second);
    }

    // Convert repeated whitespace to just one space.
    static const std::wregex rxWhitespaceMerge(L"\\s+");
    str = std::regex_replace(str, rxWhitespaceMerge, L" ");

    // Trim the string.
    str = StringTrim(str);

    // Convert strings that are ONLY whitespace to blank strings.
    static const std::wregex rxAllWhitespace(L"^\\s+$");
    str = std::regex_replace(str, rxAllWhitespace, L"");

    return StringFromWideString(str);
}
// Decides whether a token derived from a search string should be kept.
//
// token        - the candidate token.
// searchString - the search string the token was derived from.
//
// Returns `token` unchanged when it is kept, or an empty string when it is
// filtered out. A token equal to the whole search string is always kept.
// Any other token is dropped when it appears in the no-subtoken word list,
// or when a minimum subtoken word count above 1 is configured and the token
// has fewer space-separated words than that minimum.
std::string Filter::ProcessToken(
    const std::string& token,
    const std::string& searchString) const
{
    // Both filters below apply to subtokens only, so the full search string
    // can be returned without doing any further work.
    if (token == searchString) return token;

    // Process the nosubtokens list.
    if (std::find(m_noSubtokenWords.begin(), m_noSubtokenWords.end(), token)
        != m_noSubtokenWords.end())
    {
        return "";
    }

    // Process min subtoken word count.
    const std::uint16_t minWords = m_config.MinSubtokenWords();
    if (minWords > 1)
    {
        const auto words = StringSplit(token, std::string{" "});
        // Compare at the container's full size width: narrowing the count to
        // 16 bits would wrap around at 65536 words and could reject a long
        // token by mistake.
        if (words.size() < minWords) return "";
    }
    return token;
}
} // namespace usenetsearch