UsenetSearch/src/Filter.cpp

/*
    Copyright© 2021 John Sennesael

    UsenetSearch is Free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch.  If not, see <https://www.gnu.org/licenses/>.
*/

#include "usenetsearch/Filter.h"

#include "usenetsearch/StringUtils.h"

#include <algorithm>
#include <iostream>
#include <regex>

namespace usenetsearch {

// Constructs a filter bound to the given configuration object.
// Only a reference to `config` is stored, and the collection of erase-token
// regexes starts out empty; Init() is what fills it.
Filter::Filter(Configuration& config)
    : m_config(config)
    , m_eraseTokenRegexes{}
{
}

void Filter::Init()
{
    m_noSubtokenWords = m_config.FilterWordsNoSubtoken();
    const auto eraseTokens = m_config.FilterEraseSubtoken();
    // Pre-compile regexes for all the subtokens that should be erased.
    std::for_each(eraseTokens.begin(), eraseTokens.end(),
        [&](const std::string& tok){
            const std::wstring wtok = StringToLower(WideStringFromString(tok));
            m_eraseTokenRegexes.emplace(
                std::make_unique<std::wregex>(L"^" + wtok + L"\\s+"),
                std::wstring{}
            );
            m_eraseTokenRegexes.emplace(
                std::make_unique<std::wregex>(L"\\s+" + wtok + L"$"),
                std::wstring{L""}
            );
            m_eraseTokenRegexes.emplace(
                std::make_unique<std::wregex>(L"\\s+" + wtok + L"\\s+"),
                std::wstring{L" "}
            );
        }
    );
}

// Decides whether a newsgroup passes the configured blacklist / whitelist.
//
// Returns false when the whole newsgroup name matches any blacklist regex.
// Otherwise, if the whitelist is non-empty, returns true only when the whole
// name matches at least one whitelist regex. With an empty whitelist every
// non-blacklisted newsgroup is accepted.
bool Filter::ProcessNewsgroup(const std::string& newsgroup) const
{
    // std::regex_match only succeeds when the entire name matches the pattern.
    const auto fullyMatches = [&newsgroup](const auto& pattern)
    {
        return std::regex_match(newsgroup, pattern);
    };

    // The blacklist always takes precedence over the whitelist.
    const auto& blacklist = m_config.FilterNewsgroupBlacklist();
    if (std::any_of(blacklist.begin(), blacklist.end(), fullyMatches))
    {
        return false;
    }

    const auto& whitelist = m_config.FilterNewsgroupWhitelist();
    if (whitelist.size() == 0)
    {
        return true;
    }
    return std::any_of(whitelist.begin(), whitelist.end(), fullyMatches);
}

// Normalises a search string before it is tokenised.
//
// Steps, in order:
//   1. convert to a wide string and pass it through StringToLower(),
//   2. erase control characters (code points below 0x20 and 0x7f-0x9f),
//   3. replace runs of the listed punctuation characters with one space,
//   4. apply every erase-subtoken regex prepared by Init(),
//   5. collapse whitespace runs to a single space and trim the result.
//
// Returns the normalised text converted back with StringFromWideString().
std::string Filter::ProcessSearchString(const std::string& searchString) const
{
    std::wstring str = StringToLower(WideStringFromString(searchString));
    // Remove control characters. std::remove_if only shifts the characters
    // that are kept towards the front and returns the new logical end; the
    // erase() call is what actually shortens the string. Without it the string
    // keeps its old length and stale characters stay behind at the tail.
    str.erase(
        std::remove_if(str.begin(), str.end(), [](wchar_t c){
            if (c < 0x20) return true; // C0 control chars (incl. tab/newline)
            if ((c > 0x7e) && (c < 0xa0)) return true; // DEL + C1 control chars
            return false; // don't delete anything else
        }),
        str.end()
    );
    // Remove punctuation and stuff by converting to whitespace
    static const std::wregex rxPunctuation(
        L"[\\.!?#$%^&~*\\(\\)\\[\\]\"\\-<>]+");
    str = std::regex_replace(str, rxPunctuation, L" ");
    // Process erase subtoken list: each entry pairs a compiled regex with the
    // text that replaces whatever it matches.
    for (const auto& repl: m_eraseTokenRegexes)
    {
        str = std::regex_replace(str, *repl.first, repl.second);
    }
    // Convert repeated whitespace to just one space.
    static const std::wregex rxWhitespaceMerge(L"\\s+");
    str = std::regex_replace(str, rxWhitespaceMerge, L" ");
    // Trim the string.
    str = StringTrim(str);
    // Convert strings that are ONLY whitespace to blank strings.
    static const std::wregex rxAllWhitespace(L"^\\s+$");
    str = std::regex_replace(str, rxAllWhitespace, L"");
    return StringFromWideString(str);
}

// Applies the subtoken filters to a single token of a search string.
//
// token        - the candidate token.
// searchString - the full search string the token was taken from.
//
// Returns the token unchanged when it should be kept, or an empty string when
// it should be dropped. A token equal to the full search string is always
// kept. Any other token is dropped when it appears in the no-subtoken word
// list, or when the configured minimum subtoken word count is greater than 1
// and the token has fewer space-separated words than that minimum.
std::string Filter::ProcessToken(
    const std::string& token,
    const std::string& searchString) const
{
    // None of the subtoken rules apply to the complete search string.
    if (token == searchString) return token;
    // Process the nosubtokens list.
    if (std::find(m_noSubtokenWords.begin(), m_noSubtokenWords.end(), token)
        != m_noSubtokenWords.end())
    {
        return "";
    }
    // Process min subtoken word count. The token is only split when the rule
    // can actually reject something.
    const std::uint16_t minWords = m_config.MinSubtokenWords();
    if (minWords > 1)
    {
        const auto words = StringSplit(token, std::string{" "});
        // Compare the container size directly: narrowing it to 16 bits first
        // would wrap for very large word counts and could wrongly reject them.
        if (words.size() < minWords) return "";
    }
    return token;
}

} // namespace usenetsearch
Filter class 2021-10-08 22:31:23 +00:00			`/*`
			`Copyright© 2021 John Sennesael`

			`UsenetSearch is Free software: you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation, either version 3 of the License, or`
			`(at your option) any later version.`

			`UsenetSearch is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.`
			`*/`

Add logger class, fix include orders 2021-10-20 12:18:20 +00:00			`#include "usenetsearch/Filter.h"`
Filter class 2021-10-08 22:31:23 +00:00
			`#include "usenetsearch/StringUtils.h"`

Add logger class, fix include orders 2021-10-20 12:18:20 +00:00			`#include <algorithm>`
			`#include <iostream>`
			`#include <regex>`
Filter class 2021-10-08 22:31:23 +00:00
			`namespace usenetsearch {`

Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`Filter::Filter(Configuration& config): m_config(config), m_eraseTokenRegexes{}`
			`{`
			`}`

			`void Filter::Init()`
			`{`
			`m_noSubtokenWords = m_config.FilterWordsNoSubtoken();`
			`const auto eraseTokens = m_config.FilterEraseSubtoken();`
			`// Pre-compile regexes for all the subtokens that should be erased.`
			`std::for_each(eraseTokens.begin(), eraseTokens.end(),`
			`[&](const std::string& tok){`
Actual search results! Safer cleanup when interrupted. Consistent include order. And a bunch of other polish 2021-10-20 22:48:43 +00:00			`const std::wstring wtok = StringToLower(WideStringFromString(tok));`
Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`m_eraseTokenRegexes.emplace(`
			`std::make_unique<std::wregex>(L"^" + wtok + L"\\s+"),`
			`std::wstring{}`
			`);`
			`m_eraseTokenRegexes.emplace(`
			`std::make_unique<std::wregex>(L"\\s+" + wtok + L"$"),`
			`std::wstring{L""}`
			`);`
			`m_eraseTokenRegexes.emplace(`
			`std::make_unique<std::wregex>(L"\\s+" + wtok + L"\\s+"),`
			`std::wstring{L" "}`
			`);`
			`}`
			`);`
			`}`

Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) 2021-10-19 01:19:11 +00:00			`bool Filter::ProcessNewsgroup(const std::string& newsgroup) const`
Filter class 2021-10-08 22:31:23 +00:00			`{`
Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) 2021-10-19 01:19:11 +00:00			`for (const auto& blackRe: m_config.FilterNewsgroupBlacklist())`
Filter class 2021-10-08 22:31:23 +00:00			`{`
Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) 2021-10-19 01:19:11 +00:00			`std::smatch matches;`
			`if (std::regex_match(newsgroup, matches, blackRe))`
			`{`
			`if (matches.size() > 0) return false;`
			`}`
Filter class 2021-10-08 22:31:23 +00:00			`}`
Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) 2021-10-19 01:19:11 +00:00			`if (m_config.FilterNewsgroupWhitelist().size() > 0)`
Filter class 2021-10-08 22:31:23 +00:00			`{`
Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) 2021-10-19 01:19:11 +00:00			`for (const auto& whiteRe: m_config.FilterNewsgroupWhitelist())`
			`{`
			`std::smatch matches;`
			`if (std::regex_match(newsgroup, matches, whiteRe))`
			`{`
			`if (matches.size() > 0) return true;`
			`}`
			`}`
			`return false;`
Filter class 2021-10-08 22:31:23 +00:00			`}`
Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) 2021-10-19 01:19:11 +00:00			`return true;`
			`}`

			`std::string Filter::ProcessSearchString(const std::string& searchString) const`
			`{`
Actual search results! Safer cleanup when interrupted. Consistent include order. And a bunch of other polish 2021-10-20 22:48:43 +00:00			`std::wstring str = StringToLower(WideStringFromString(searchString));`
Filter class 2021-10-08 22:31:23 +00:00			`std::remove_if(str.begin(), str.end(), [](wchar_t c){`
			`// Remove control characters.`
			`if (c < 0x20) return true; // ascii control chars`
			`if ((c > 0x7e) && (c < 0xa0)) return true; // utf8 control chars`
			`return false; // don't delete anything else`
			`});`
			`// Remove punctuation and stuff by converting to whitespace`
Actual search results! Safer cleanup when interrupted. Consistent include order. And a bunch of other polish 2021-10-20 22:48:43 +00:00			`static std::wregex rxPunctuation(L"[\\.!?#$%^&~*\\(\\)\\[\\]\"\\-<>]+");`
Filter class 2021-10-08 22:31:23 +00:00			`str = std::regex_replace(str, rxPunctuation, L" ");`
Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`// Process erase subtoken list.`
			`std::for_each(m_eraseTokenRegexes.begin(), m_eraseTokenRegexes.end(),`
			`[&str](const auto& repl){`
			`str = std::regex_replace(str, *repl.first, repl.second);`
			`}`
			`);`
Filter class 2021-10-08 22:31:23 +00:00			`// Convert repeated whitespace to just one space.`
			`static std::wregex rxWhitespaceMerge(L"\\s+");`
			`str = std::regex_replace(str, rxWhitespaceMerge, L" ");`
			`// Trim the string.`
			`str = StringTrim(str);`
			`// Convert strings that are ONLY whitespace to blank strings.`
			`static std::wregex rxAllWhitespace(L"^\\s+$");`
			`str = std::regex_replace(str, rxAllWhitespace, L"");`
Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) 2021-10-19 01:19:11 +00:00			`return StringFromWideString(str);`
Filter class 2021-10-08 22:31:23 +00:00			`}`

Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`std::string Filter::ProcessToken(`
			`const std::string& token,`
Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) 2021-10-19 01:19:11 +00:00			`const std::string& searchString) const`
Implement various filter options, some bugfixes. 2021-10-12 23:41:03 +00:00			`{`
			`std::string result = token;`
			`// Process the nosubtokens list.`
			`if (token != searchString)`
			`{`
			`if (std::find(`
			`m_noSubtokenWords.begin(), m_noSubtokenWords.end(), result)`
			`!= m_noSubtokenWords.end()) return "";`
			`}`
			`// Process min subtoken word count.`
			`const auto words = StringSplit(result, std::string{" "});`
			`const std::uint16_t wordCount = words.size();`
			`const std::uint16_t minWords = m_config.MinSubtokenWords();`
			`if (minWords > 1)`
			`{`
			`if ((wordCount < minWords) && (token != searchString))`
			`{`
			`return "";`
			`}`
			`}`
			`result = token;`
			`return result;`
			`}`

Filter class 2021-10-08 22:31:23 +00:00			`} // namespace usenetsearch`