Filter class

This commit is contained in:
John Sennesael 2021-10-08 17:31:23 -05:00
parent 7f8a745858
commit e7619f5236
5 changed files with 113 additions and 18 deletions

View File

@ -40,6 +40,7 @@ add_library(usenetsearch
"src/Database.cpp"
"src/Dns.cpp"
"src/Except.cpp"
"src/Filter.cpp"
"src/IoSocket.cpp"
"src/Serialize.cpp"
"src/SSLConnection.cpp"

View File

@ -0,0 +1,37 @@
/*
Copyright© 2021 John Sennesael
UsenetSearch is Free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#pragma once
#include <codecvt>
#include <locale>
#include <string>
namespace usenetsearch {
/**
 * Text filter that normalizes search strings.
 */
class Filter
{
public:
    /**
     * Processes a search string and returns the filtered result.
     *
     * @param searchString The string to process.
     * @return The processed string.
     */
    std::string ProcessSearchString(const std::string& searchString);

private:
    /// Converter between UTF-8 byte strings and UTF-16 wide strings.
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
};
} // namespace usenetsearch

View File

@ -21,6 +21,7 @@
#include <locale>
#include "usenetsearch/Application.h"
#include "usenetsearch/Filter.h"
#include "usenetsearch/UsenetClient.h"
#include "usenetsearch/ThreadPool.h"
@ -31,6 +32,7 @@ class Indexer
Application& m_app;
UsenetClient& m_client;
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
Filter m_filter;
ThreadPool m_threads;
public:

69
src/Filter.cpp Normal file
View File

@ -0,0 +1,69 @@
/*
Copyright© 2021 John Sennesael
UsenetSearch is Free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <regex>
#include "usenetsearch/StringUtils.h"
#include "usenetsearch/Filter.h"
namespace usenetsearch {
/**
 * Normalizes a UTF-8 search string.
 *
 * Steps, in order: decode to a wide string, drop control characters,
 * lowercase and strip "re:", replace punctuation with spaces, collapse runs
 * of whitespace, trim, and encode back to UTF-8.
 *
 * @param searchString UTF-8 encoded input.
 * @return The normalized UTF-8 string, or an empty string when the input is
 *         not valid UTF-8, when the result cannot be encoded back, or when
 *         nothing but whitespace is left after filtering.
 */
std::string Filter::ProcessSearchString(const std::string& searchString)
{
    // Use a converter local to this call rather than a shared member:
    // std::wstring_convert updates internal conversion state on every
    // from_bytes()/to_bytes(), so sharing one instance between concurrent
    // callers (e.g. thread pool jobs using the same Filter) is a data race.
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;

    std::wstring str;
    try
    {
        str = conv.from_bytes(searchString);
    }
    catch (const std::range_error&)
    {
        return ""; // string is not valid utf8
    }

    // Remove control characters. std::remove_if only shifts the kept
    // elements to the front; the erase() call is what actually shrinks the
    // string (erase-remove idiom).
    str.erase(
        std::remove_if(str.begin(), str.end(), [](wchar_t c){
            if (c < 0x20) return true; // ascii (C0) control chars
            if ((c > 0x7e) && (c < 0xa0)) return true; // DEL and C1 control chars
            return false; // don't delete anything else
        }),
        str.end()
    );

    // Remove Re: for obvious reasons.
    // NOTE(review): presumably StringRemove drops every occurrence of the
    // needle, which would also hit words ending in "re:" - confirm.
    str = StringRemove(StringToLower(str), std::wstring{L"re:"});

    // Remove punctuation and stuff by converting to whitespace.
    // The hyphen is escaped so it is matched literally; unescaped, "-< inside
    // the class is a range (U+0022..U+003C) that also swallows digits,
    // apostrophes, commas, slashes, colons and semicolons.
    static const std::wregex rxPunctuation(
        L"[\\.!?#$%^&~*()+\\[\\]\"\\-<>]+");
    str = std::regex_replace(str, rxPunctuation, L" ");

    // Convert repeated whitespace to just one space.
    static const std::wregex rxWhitespaceMerge(L"\\s+");
    str = std::regex_replace(str, rxWhitespaceMerge, L" ");

    // Trim the string.
    str = StringTrim(str);

    // Convert strings that are ONLY whitespace to blank strings.
    static const std::wregex rxAllWhitespace(L"^\\s+$");
    str = std::regex_replace(str, rxAllWhitespace, L"");

    try
    {
        return conv.to_bytes(str);
    }
    catch (const std::range_error&)
    {
        return ""; // result could not be encoded as utf8
    }
}
} // namespace usenetsearch

View File

@ -46,8 +46,6 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
{
/**
* @todo Replace all stdout stuff with Logger class.
* @todo All the filtering (subject.erase bits and such) need to move to a
* separate class.
*/
const size_t batchSize = m_app.Config().BatchSize();
for (const auto& group: newsgroups)
@ -64,25 +62,13 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
std::reference_wrapper<Database> dbref = std::ref(m_app.Db());
m_client.ProcessHeaders(0,
[this, &headerCount, &dbref](std::shared_ptr<NntpHeaders> headers){
m_threads.Queue([headers, &headerCount, &dbref](){
m_threads.Queue([this, headers, &headerCount, &dbref](){
for (const auto& header: *headers)
{
const std::uint64_t id{header.articleID};
std::string subject = StringRemove(
StringToLower(header.subject), std::string{"re:"}
);
subject.erase(
std::remove_if(
subject.begin(), subject.end(),
[](char c){
if (std::isspace(c)) return false;
if ((c > 65) && (c < 90)) return false;
if ((c > 97) && (c < 122)) return false;
if (c == '\'') return false;
if ((c > 48) && (c < 57)) return false;
return true;
}), subject.end()
);
std::string subject = header.subject;
subject = m_filter.ProcessSearchString(subject);
if (subject == "") continue;
dbref.get().SaveSearchTokens(1, id, subject);
headerCount++;
}