Filter class
This commit is contained in:
parent
7f8a745858
commit
e7619f5236
|
@ -40,6 +40,7 @@ add_library(usenetsearch
|
|||
"src/Database.cpp"
|
||||
"src/Dns.cpp"
|
||||
"src/Except.cpp"
|
||||
"src/Filter.cpp"
|
||||
"src/IoSocket.cpp"
|
||||
"src/Serialize.cpp"
|
||||
"src/SSLConnection.cpp"
|
||||
|
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
Copyright© 2021 John Sennesael
|
||||
|
||||
UsenetSearch is Free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
UsenetSearch is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <codecvt>
|
||||
#include <locale>
|
||||
#include <string>
|
||||
|
||||
namespace usenetsearch {
|
||||
|
||||
/**
 * Text filter that turns raw UTF-8 strings into a normalised form.
 */
class Filter
{

public:

    /**
     * Normalises a UTF-8 encoded string.
     *
     * @param searchString UTF-8 encoded input text.
     * @return The processed text, UTF-8 encoded.
     */
    std::string ProcessSearchString(const std::string& searchString);

private:

    /// Converter between UTF-8 byte strings and UTF-16 wide strings.
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;

};
|
||||
|
||||
} // namespace usenetsearch
|
|
@ -21,6 +21,7 @@
|
|||
#include <locale>
|
||||
|
||||
#include "usenetsearch/Application.h"
|
||||
#include "usenetsearch/Filter.h"
|
||||
#include "usenetsearch/UsenetClient.h"
|
||||
#include "usenetsearch/ThreadPool.h"
|
||||
|
||||
|
@ -31,6 +32,7 @@ class Indexer
|
|||
Application& m_app;
|
||||
UsenetClient& m_client;
|
||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
|
||||
Filter m_filter;
|
||||
ThreadPool m_threads;
|
||||
|
||||
public:
|
||||
|
|
|
@ -0,0 +1,69 @@
|
|||
/*
|
||||
Copyright© 2021 John Sennesael
|
||||
|
||||
UsenetSearch is Free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
UsenetSearch is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <regex>
|
||||
|
||||
#include "usenetsearch/StringUtils.h"
|
||||
|
||||
#include "usenetsearch/Filter.h"
|
||||
|
||||
namespace usenetsearch {
|
||||
|
||||
std::string Filter::ProcessSearchString(const std::string& searchString)
|
||||
{
|
||||
std::wstring str;
|
||||
try
|
||||
{
|
||||
str = m_conv.from_bytes(searchString);
|
||||
}
|
||||
catch (const std::range_error&)
|
||||
{
|
||||
return ""; // string is not valid utf8
|
||||
}
|
||||
std::remove_if(str.begin(), str.end(), [](wchar_t c){
|
||||
// Remove control characters.
|
||||
if (c < 0x20) return true; // ascii control chars
|
||||
if ((c > 0x7e) && (c < 0xa0)) return true; // utf8 control chars
|
||||
return false; // don't delete anything else
|
||||
});
|
||||
// Remove Re: for obvious reasons
|
||||
str = StringRemove(StringToLower(str), std::wstring{L"re:"});
|
||||
// Remove punctuation and stuff by converting to whitespace
|
||||
static std::wregex rxPunctuation(L"[\\.!?#$%^&~*()+\\[\\]\"-<>]+");
|
||||
str = std::regex_replace(str, rxPunctuation, L" ");
|
||||
// Convert repeated whitespace to just one space.
|
||||
static std::wregex rxWhitespaceMerge(L"\\s+");
|
||||
str = std::regex_replace(str, rxWhitespaceMerge, L" ");
|
||||
// Trim the string.
|
||||
str = StringTrim(str);
|
||||
// Convert strings that are ONLY whitespace to blank strings.
|
||||
static std::wregex rxAllWhitespace(L"^\\s+$");
|
||||
str = std::regex_replace(str, rxAllWhitespace, L"");
|
||||
std::string result;
|
||||
try
|
||||
{
|
||||
result = m_conv.to_bytes(str);
|
||||
}
|
||||
catch (const std::range_error&)
|
||||
{
|
||||
return "";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace usenetsearch
|
|
@ -46,8 +46,6 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
|
|||
{
|
||||
/**
|
||||
* @todo Replace all stdout stuff with Logger class.
|
||||
* @todo All the filtering (subject.erase bits and such) need to move to a
|
||||
* separate class.
|
||||
*/
|
||||
const size_t batchSize = m_app.Config().BatchSize();
|
||||
for (const auto& group: newsgroups)
|
||||
|
@ -64,25 +62,13 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
|
|||
std::reference_wrapper<Database> dbref = std::ref(m_app.Db());
|
||||
m_client.ProcessHeaders(0,
|
||||
[this, &headerCount, &dbref](std::shared_ptr<NntpHeaders> headers){
|
||||
m_threads.Queue([headers, &headerCount, &dbref](){
|
||||
m_threads.Queue([this, headers, &headerCount, &dbref](){
|
||||
for (const auto& header: *headers)
|
||||
{
|
||||
const std::uint64_t id{header.articleID};
|
||||
std::string subject = StringRemove(
|
||||
StringToLower(header.subject), std::string{"re:"}
|
||||
);
|
||||
subject.erase(
|
||||
std::remove_if(
|
||||
subject.begin(), subject.end(),
|
||||
[](char c){
|
||||
if (std::isspace(c)) return false;
|
||||
if ((c > 65) && (c < 90)) return false;
|
||||
if ((c > 97) && (c < 122)) return false;
|
||||
if (c == '\'') return false;
|
||||
if ((c > 48) && (c < 57)) return false;
|
||||
return true;
|
||||
}), subject.end()
|
||||
);
|
||||
std::string subject = header.subject;
|
||||
subject = m_filter.ProcessSearchString(subject);
|
||||
if (subject == "") continue;
|
||||
dbref.get().SaveSearchTokens(1, id, subject);
|
||||
headerCount++;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue