2021-10-08 22:31:23 +00:00
|
|
|
/*
    Copyright© 2021 John Sennesael

    UsenetSearch is Free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
|
|
|
|
|
2021-10-20 12:18:20 +00:00
|
|
|
#include "usenetsearch/Filter.h"
|
2021-10-08 22:31:23 +00:00
|
|
|
|
|
|
|
#include "usenetsearch/StringUtils.h"
|
|
|
|
|
2021-10-20 12:18:20 +00:00
|
|
|
#include <algorithm>
|
|
|
|
#include <iostream>
|
|
|
|
#include <regex>
|
2021-10-08 22:31:23 +00:00
|
|
|
|
|
|
|
namespace usenetsearch {
|
|
|
|
|
2021-10-12 23:41:03 +00:00
|
|
|
Filter::Filter(Configuration& config): m_config(config), m_eraseTokenRegexes{}
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
void Filter::Init()
|
|
|
|
{
|
|
|
|
m_noSubtokenWords = m_config.FilterWordsNoSubtoken();
|
|
|
|
const auto eraseTokens = m_config.FilterEraseSubtoken();
|
|
|
|
// Pre-compile regexes for all the subtokens that should be erased.
|
|
|
|
std::for_each(eraseTokens.begin(), eraseTokens.end(),
|
|
|
|
[&](const std::string& tok){
|
2021-10-20 22:48:43 +00:00
|
|
|
const std::wstring wtok = StringToLower(WideStringFromString(tok));
|
2021-10-12 23:41:03 +00:00
|
|
|
m_eraseTokenRegexes.emplace(
|
|
|
|
std::make_unique<std::wregex>(L"^" + wtok + L"\\s+"),
|
|
|
|
std::wstring{}
|
|
|
|
);
|
|
|
|
m_eraseTokenRegexes.emplace(
|
|
|
|
std::make_unique<std::wregex>(L"\\s+" + wtok + L"$"),
|
|
|
|
std::wstring{L""}
|
|
|
|
);
|
|
|
|
m_eraseTokenRegexes.emplace(
|
|
|
|
std::make_unique<std::wregex>(L"\\s+" + wtok + L"\\s+"),
|
|
|
|
std::wstring{L" "}
|
|
|
|
);
|
|
|
|
}
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2021-10-19 01:19:11 +00:00
|
|
|
bool Filter::ProcessNewsgroup(const std::string& newsgroup) const
|
2021-10-08 22:31:23 +00:00
|
|
|
{
|
2021-10-19 01:19:11 +00:00
|
|
|
for (const auto& blackRe: m_config.FilterNewsgroupBlacklist())
|
2021-10-08 22:31:23 +00:00
|
|
|
{
|
2021-10-19 01:19:11 +00:00
|
|
|
std::smatch matches;
|
|
|
|
if (std::regex_match(newsgroup, matches, blackRe))
|
|
|
|
{
|
|
|
|
if (matches.size() > 0) return false;
|
|
|
|
}
|
2021-10-08 22:31:23 +00:00
|
|
|
}
|
2021-10-19 01:19:11 +00:00
|
|
|
if (m_config.FilterNewsgroupWhitelist().size() > 0)
|
2021-10-08 22:31:23 +00:00
|
|
|
{
|
2021-10-19 01:19:11 +00:00
|
|
|
for (const auto& whiteRe: m_config.FilterNewsgroupWhitelist())
|
|
|
|
{
|
|
|
|
std::smatch matches;
|
|
|
|
if (std::regex_match(newsgroup, matches, whiteRe))
|
|
|
|
{
|
|
|
|
if (matches.size() > 0) return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
2021-10-08 22:31:23 +00:00
|
|
|
}
|
2021-10-19 01:19:11 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string Filter::ProcessSearchString(const std::string& searchString) const
|
|
|
|
{
|
2021-10-20 22:48:43 +00:00
|
|
|
std::wstring str = StringToLower(WideStringFromString(searchString));
|
2021-10-08 22:31:23 +00:00
|
|
|
std::remove_if(str.begin(), str.end(), [](wchar_t c){
|
|
|
|
// Remove control characters.
|
|
|
|
if (c < 0x20) return true; // ascii control chars
|
|
|
|
if ((c > 0x7e) && (c < 0xa0)) return true; // utf8 control chars
|
|
|
|
return false; // don't delete anything else
|
|
|
|
});
|
|
|
|
// Remove punctuation and stuff by converting to whitespace
|
2021-10-20 22:48:43 +00:00
|
|
|
static std::wregex rxPunctuation(L"[\\.!?#$%^&~*\\(\\)\\[\\]\"\\-<>]+");
|
2021-10-08 22:31:23 +00:00
|
|
|
str = std::regex_replace(str, rxPunctuation, L" ");
|
2021-10-12 23:41:03 +00:00
|
|
|
// Process erase subtoken list.
|
|
|
|
std::for_each(m_eraseTokenRegexes.begin(), m_eraseTokenRegexes.end(),
|
|
|
|
[&str](const auto& repl){
|
|
|
|
str = std::regex_replace(str, *repl.first, repl.second);
|
|
|
|
}
|
|
|
|
);
|
2021-10-08 22:31:23 +00:00
|
|
|
// Convert repeated whitespace to just one space.
|
|
|
|
static std::wregex rxWhitespaceMerge(L"\\s+");
|
|
|
|
str = std::regex_replace(str, rxWhitespaceMerge, L" ");
|
|
|
|
// Trim the string.
|
|
|
|
str = StringTrim(str);
|
|
|
|
// Convert strings that are ONLY whitespace to blank strings.
|
|
|
|
static std::wregex rxAllWhitespace(L"^\\s+$");
|
|
|
|
str = std::regex_replace(str, rxAllWhitespace, L"");
|
2021-10-19 01:19:11 +00:00
|
|
|
return StringFromWideString(str);
|
2021-10-08 22:31:23 +00:00
|
|
|
}
|
|
|
|
|
2021-10-12 23:41:03 +00:00
|
|
|
std::string Filter::ProcessToken(
|
|
|
|
const std::string& token,
|
2021-10-19 01:19:11 +00:00
|
|
|
const std::string& searchString) const
|
2021-10-12 23:41:03 +00:00
|
|
|
{
|
|
|
|
std::string result = token;
|
|
|
|
// Process the nosubtokens list.
|
|
|
|
if (token != searchString)
|
|
|
|
{
|
|
|
|
if (std::find(
|
|
|
|
m_noSubtokenWords.begin(), m_noSubtokenWords.end(), result)
|
|
|
|
!= m_noSubtokenWords.end()) return "";
|
|
|
|
}
|
|
|
|
// Process min subtoken word count.
|
|
|
|
const auto words = StringSplit(result, std::string{" "});
|
|
|
|
const std::uint16_t wordCount = words.size();
|
|
|
|
const std::uint16_t minWords = m_config.MinSubtokenWords();
|
|
|
|
if (minWords > 1)
|
|
|
|
{
|
|
|
|
if ((wordCount < minWords) && (token != searchString))
|
|
|
|
{
|
|
|
|
return "";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
result = token;
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
2021-10-08 22:31:23 +00:00
|
|
|
} // namespace usenetsearch
|