// UsenetSearch/src/Filter.cpp

/*
    Copyright© 2021 John Sennesael

    UsenetSearch is Free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch.  If not, see <https://www.gnu.org/licenses/>.
*/

#include "usenetsearch/Filter.h"

#include "usenetsearch/StringUtils.h"

#include <algorithm>
#include <iostream>
#include <regex>

namespace usenetsearch {

/**
 * Constructs a filter bound to the given configuration.
 *
 * Only stores the configuration and starts with an empty set of erase-token
 * regexes; Init() is what actually reads the filter settings.
 *
 * @param config Configuration the filter reads its settings from.
 */
Filter::Filter(Configuration& config)
    : m_config(config)
    , m_eraseTokenRegexes{}
{
}

/**
 * Loads the filter settings from the configuration.
 *
 * Copies the "no subtoken" word list and pre-compiles, for every configured
 * erase-subtoken, the regexes that ProcessSearchString() later applies. Each
 * regex is stored together with the text that replaces a match.
 */
void Filter::Init()
{
    m_noSubtokenWords = m_config.FilterWordsNoSubtoken();

    // NOTE(review): the token text is spliced into the pattern verbatim, so
    // any regex metacharacters in a configured token are interpreted as regex
    // syntax - confirm whether tokens are meant to be literals.
    const auto subtokens = m_config.FilterEraseSubtoken();
    for (const std::string& rawToken: subtokens)
    {
        const std::wstring needle =
            StringToLower(WideStringFromString(rawToken));

        // Token at the start of the string: drop it and the whitespace after.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"^" + needle + L"\\s+"),
            std::wstring{}
        );

        // Token at the end of the string: drop it and the whitespace before.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + needle + L"$"),
            std::wstring{L""}
        );

        // Token between two words: collapse it into a single space so the
        // neighbouring words stay separated.
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + needle + L"\\s+"),
            std::wstring{L" "}
        );
    }
}

/**
 * Decides whether a newsgroup passes the configured black/white lists.
 *
 * The blacklist always wins: a newsgroup whose whole name matches any
 * blacklist regex is rejected. If a whitelist is configured (non-empty), the
 * newsgroup must additionally match at least one whitelist regex in full. With
 * no whitelist configured, everything that is not blacklisted is accepted.
 *
 * @param newsgroup Name of the newsgroup to test.
 * @return true if the newsgroup should be processed, false if it is filtered.
 */
bool Filter::ProcessNewsgroup(const std::string& newsgroup) const
{
    // std::regex_match only succeeds when the entire name matches, and its
    // boolean result is all that is needed here - no match_results required.
    for (const auto& blackRe: m_config.FilterNewsgroupBlacklist())
    {
        if (std::regex_match(newsgroup, blackRe)) return false;
    }

    // Fetch the whitelist once instead of once for the size check and again
    // for the loop.
    const auto& whitelist = m_config.FilterNewsgroupWhitelist();
    if (whitelist.empty()) return true;

    for (const auto& whiteRe: whitelist)
    {
        if (std::regex_match(newsgroup, whiteRe)) return true;
    }
    return false;
}

/**
 * Normalises a search string before it is tokenised.
 *
 * Steps, in order: lower-case the text, strip control characters, turn runs
 * of punctuation into a space, apply the erase-subtoken regexes compiled by
 * Init(), merge repeated whitespace into one space, and trim the result.
 *
 * @param searchString The raw search string.
 * @return The normalised string; empty if nothing but whitespace remains.
 */
std::string Filter::ProcessSearchString(const std::string& searchString) const
{
    std::wstring str = StringToLower(WideStringFromString(searchString));

    // Remove control characters. std::remove_if only moves the characters
    // that are kept to the front and returns the new logical end; the tail
    // has to be erased explicitly, otherwise stale characters are left behind
    // at the end of the string.
    // NOTE(review): this deletes tab/newline outright rather than turning them
    // into a space, so words separated only by such characters are joined.
    str.erase(
        std::remove_if(str.begin(), str.end(), [](wchar_t c){
            if (c < 0x20) return true; // ascii (C0) control chars
            if ((c > 0x7e) && (c < 0xa0)) return true; // DEL and C1 controls
            return false; // don't delete anything else
        }),
        str.end()
    );

    // Remove punctuation and stuff by converting to whitespace
    static const std::wregex rxPunctuation(
        L"[\\.!?#$%^&~*\\(\\)\\[\\]\"\\-<>]+");
    str = std::regex_replace(str, rxPunctuation, L" ");

    // Process erase subtoken list: each entry pairs a pre-compiled regex with
    // the text that replaces a match.
    for (const auto& repl: m_eraseTokenRegexes)
    {
        str = std::regex_replace(str, *repl.first, repl.second);
    }

    // Convert repeated whitespace to just one space.
    static const std::wregex rxWhitespaceMerge(L"\\s+");
    str = std::regex_replace(str, rxWhitespaceMerge, L" ");

    // Trim the string.
    str = StringTrim(str);

    // Convert strings that are ONLY whitespace to blank strings.
    static const std::wregex rxAllWhitespace(L"^\\s+$");
    str = std::regex_replace(str, rxAllWhitespace, L"");

    return StringFromWideString(str);
}

/**
 * Decides whether a token derived from a search string should be kept.
 *
 * A token that equals the complete search string is always kept. Any other
 * token (a subtoken) is dropped when it appears in the "no subtoken" word
 * list, or when a minimum subtoken word count greater than one is configured
 * and the token has fewer space-separated words than that.
 *
 * @param token        The candidate token.
 * @param searchString The full search string the token was derived from.
 * @return The token unchanged if it is kept, or an empty string if dropped.
 */
std::string Filter::ProcessToken(
    const std::string& token,
    const std::string& searchString) const
{
    // The full search string is exempt from every subtoken rule.
    if (token == searchString) return token;

    // Process the nosubtokens list.
    if (std::find(
            m_noSubtokenWords.begin(), m_noSubtokenWords.end(), token)
        != m_noSubtokenWords.end()) return "";

    // Process min subtoken word count. The word count keeps the container's
    // own size type: narrowing it to 16 bits would wrap for very long tokens
    // and wrongly reject them.
    const std::uint16_t minWords = m_config.MinSubtokenWords();
    if (minWords > 1)
    {
        const auto wordCount = StringSplit(token, std::string{" "}).size();
        if (wordCount < minWords) return "";
    }
    return token;
}

} // namespace usenetsearch