Implement various filter options, some bugfixes.

This commit is contained in:
John Sennesael 2021-10-12 18:41:03 -05:00
parent e7619f5236
commit 8b2ac61264
15 changed files with 377 additions and 49 deletions

View File

@ -23,6 +23,7 @@
#include "usenetsearch/Configuration.h"
#include "usenetsearch/Database.h"
#include "usenetsearch/Filter.h"
namespace usenetsearch {
@ -54,13 +55,13 @@ class Application
Configuration m_config;
std::string m_configFile{"usenetsearch.conf"};
Database m_db;
Filter m_filter;
void ExecuteCustomOption(
const std::shared_ptr<CommandLineOption>&,
std::shared_ptr<CommandLineOption>&,
const std::string& value=""
);
void ParseArgs(int argc, char* argv[]);
void Usage(const std::string& programName);
public:
@ -77,9 +78,17 @@ public:
std::function<void(std::filesystem::path)> onParse,
std::filesystem::path defaultValue = "."
);
Configuration& Config();
Database& Db();
void AddStringOption(
char option,
const std::string& help,
std::function<void(std::string)> onParse,
std::string defaultValue = ""
);
Configuration& GetConfig();
Database& GetDb();
Filter& GetFilter();
void Init(int argc, char* argv[]);
void Usage(const std::string& programName);
};
} // namespace usenetsearch

View File

@ -35,8 +35,11 @@ struct ConfigurationException: public UsenetSearchException
class Configuration
{
std::uint16_t m_batchSize{1};
std::vector<std::string> m_filterEraseSubtoken;
std::vector<std::string> m_filterWordsNoSubtoken;
std::uint16_t m_maxThreads{1};
std::uint8_t m_maxTreeDepth{5};
std::uint16_t m_minSubtokenWords{1};
std::string m_nntpServerHost{"127.0.0.1"};
std::string m_nntpServerPassword{"password"};
int m_nntpServerPort{119};
@ -48,8 +51,11 @@ public:
std::uint16_t BatchSize() const;
std::filesystem::path DatabasePath() const;
std::vector<std::string> FilterEraseSubtoken() const;
std::vector<std::string> FilterWordsNoSubtoken() const;
std::uint16_t MaxThreads() const;
std::uint8_t MaxTreeDepth() const;
std::uint16_t MinSubtokenWords() const;
std::string NNTPServerHost() const;
std::string NNTPServerPassword() const;
int NNTPServerPort() const;

View File

@ -26,6 +26,7 @@
#include <mutex>
#include <vector>
#include "usenetsearch/Filter.h"
#include "usenetsearch/Serialize.h"
#include "usenetsearch/UsenetClient.h"
@ -57,6 +58,7 @@ class Database
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
std::filesystem::path m_databasePath;
std::uint64_t m_databaseVersion{DatabaseVersion};
Filter& m_filter;
std::vector<std::filesystem::path> m_lockedFiles;
std::mutex m_lockedFilesMutex;
std::uint8_t m_maxTreeDepth{5};
@ -79,6 +81,10 @@ class Database
std::uint64_t newsgroupID,
std::uint32_t articleID
);
std::unique_ptr<std::vector<ArticleEntry>> LoadTokens(
const std::filesystem::path dbFile,
const std::string& subtoken
);
void OpenNewsGroupFile();
void SaveToken(
const std::string& subToken,
@ -88,6 +94,7 @@ class Database
public:
explicit Database(Filter& filter);
~Database();
std::unique_ptr<std::vector<NntpHeader>> LoadArticleList(
const std::wstring& newsgroup
@ -99,16 +106,19 @@ public:
const std::filesystem::path& dbFile,
std::function<void(const ArticleEntry& entry)> onParse
);
void UpdateArticleList(
const std::wstring& newsgroup,
const std::vector<NntpHeader>& headers
);
void UpdateNewsgroupList(const std::vector<NntpListEntry>& list);
void SaveSearchTokens(
std::uint64_t newsgroupID,
std::uint64_t articleID,
const std::string& searchString
);
std::unique_ptr<std::vector<ArticleEntry>> Search(
const std::string& searchString
);
void UpdateArticleList(
const std::wstring& newsgroup,
const std::vector<NntpHeader>& headers
);
void UpdateNewsgroupList(const std::vector<NntpListEntry>& list);
};

View File

@ -19,18 +19,32 @@
#include <codecvt>
#include <locale>
#include <regex>
#include <string>
#include <unordered_map>
#include "usenetsearch/Configuration.h"
namespace usenetsearch {
class Filter
{
Configuration& m_config;
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
std::vector<std::string> m_noSubtokenWords;
std::unordered_map<std::unique_ptr<std::wregex>, std::wstring>
m_eraseTokenRegexes;
public:
explicit Filter(Configuration& config);
void Init();
std::string ProcessSearchString(const std::string& searchString);
std::string ProcessToken(
const std::string& token,
const std::string& searchString
);
};

View File

@ -32,7 +32,6 @@ class Indexer
Application& m_app;
UsenetClient& m_client;
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
Filter m_filter;
ThreadPool m_threads;
public:

View File

@ -57,6 +57,22 @@ void Application::AddFileOption(
m_commandLineArguments.emplace_back(std::move(val));
}
/**
 * Registers a string-valued command-line option (e.g. "-s value").
 *
 * @param option       Single-character option flag.
 * @param help         Help text shown by Usage().
 * @param onParse      Callback invoked with the parsed value.
 * @param defaultValue Value used when the option is not supplied.
 */
void Application::AddStringOption(
    char option,
    const std::string& help,
    std::function<void(std::string)> onParse,
    std::string defaultValue)
{
    auto entry = std::make_shared<CommandLineOptionValue<std::string>>();
    entry->option = option;
    entry->type = CommandLineOptionType::String;
    entry->helpText = help;
    entry->onParse = std::move(onParse);
    entry->value = std::move(defaultValue);
    m_commandLineArguments.emplace_back(std::move(entry));
}
void Application::Usage(const std::string& programName)
{
std::cout << "UsenetSearch - usenet search indexer" << std::endl;
@ -109,59 +125,67 @@ void Application::Usage(const std::string& programName)
std::cout << std::endl;
}
Application::Application()
Application::Application() : m_db(m_filter), m_filter(m_config)
{
std::cout.setf(std::ios::unitbuf);
}
Configuration& Application::Config()
Configuration& Application::GetConfig()
{
return m_config;
}
Database& Application::Db()
Database& Application::GetDb()
{
return m_db;
}
void Application::ExecuteCustomOption(
const std::shared_ptr<CommandLineOption>& opt,
std::shared_ptr<CommandLineOption>& opt,
const std::string& value)
{
switch (opt->type)
{
case CommandLineOptionType::Boolean:
{
const auto castedOption = std::dynamic_pointer_cast<
std::shared_ptr<const CommandLineOptionValue<bool>>>(opt);
std::shared_ptr<CommandLineOptionValue<bool>> castedOption =
std::dynamic_pointer_cast<CommandLineOptionValue<bool>>(
opt
);
if (castedOption == nullptr)
{
throw std::runtime_error(
"Could not cast cli arg to the correct type."
);
}
castedOption->get()->onParse(true);
castedOption->onParse(true);
}
break;
case CommandLineOptionType::String:
{
const auto castedOption = std::dynamic_pointer_cast<
std::shared_ptr<const CommandLineOptionValue<std::string>>>(
opt);
std::shared_ptr<CommandLineOptionValue<std::string>>
castedOption = std::dynamic_pointer_cast<
CommandLineOptionValue<std::string>>(
opt
);
if (castedOption == nullptr)
{
throw std::runtime_error(
"Could not cast cli arg to the correct type."
);
}
castedOption->get()->onParse(value);
castedOption->onParse(value);
}
break;
case CommandLineOptionType::Path:
{
const auto castedOption = std::dynamic_pointer_cast<
CommandLineOptionValue<
std::filesystem::path>>(opt);
std::shared_ptr<CommandLineOptionValue<
std::filesystem::path>> castedOption =
std::dynamic_pointer_cast<
CommandLineOptionValue<
std::filesystem::path
>
>(opt);
if (castedOption == nullptr)
{
throw std::runtime_error(
@ -174,6 +198,11 @@ void Application::ExecuteCustomOption(
}
}
// Accessor for the application's Filter instance (used to normalize
// subjects and search strings before tokenizing/indexing).
Filter& Application::GetFilter()
{
return m_filter;
}
void Application::Init(int argc, char* argv[])
{
ParseArgs(argc, argv);
@ -182,6 +211,7 @@ void Application::Init(int argc, char* argv[])
m_config.Open(m_configFile);
m_db.MaxTreeDepth(m_config.MaxTreeDepth());
m_db.Open(m_config.DatabasePath());
m_filter.Init();
}
void Application::ParseArgs(int argc, char* argv[])
@ -212,7 +242,7 @@ void Application::ParseArgs(int argc, char* argv[])
{
// Parse custom options.
bool parsed{false};
for (const auto& optionValue: m_commandLineArguments)
for (auto optionValue: m_commandLineArguments)
{
if ((std::string{"-"} + optionValue->option) == curr_opt)
{

View File

@ -35,6 +35,16 @@ std::filesystem::path Configuration::DatabasePath() const
return m_databasePath;
}
// Returns the lowercase sub-strings collected from "filter_erase_subtoken"
// config lines; these are erased from strings before tokenizing.
std::vector<std::string> Configuration::FilterEraseSubtoken() const
{
return m_filterEraseSubtoken;
}
// Returns the lowercase tokens collected from "filter_no_subtoken" config
// lines; these are indexed only on whole-string (direct) matches.
std::vector<std::string> Configuration::FilterWordsNoSubtoken() const
{
return m_filterWordsNoSubtoken;
}
std::uint16_t Configuration::MaxThreads() const
{
return m_maxThreads;
@ -45,6 +55,11 @@ std::uint8_t Configuration::MaxTreeDepth() const
return m_maxTreeDepth;
}
// Returns the "minimum_subtoken_words" setting: the minimum number of
// words a sub-token must contain to be indexed (defaults to 1).
std::uint16_t Configuration::MinSubtokenWords() const
{
return m_minSubtokenWords;
}
std::string Configuration::NNTPServerHost() const
{
return m_nntpServerHost;
@ -110,6 +125,30 @@ void Configuration::Open(const std::string& filename)
{
m_databasePath = value;
}
else if (key == "filter_erase_subtoken")
{
const auto tokens = StringSplit(value, std::string{","});
for (const auto& token: tokens)
{
const auto trimmedToken = StringToLower(StringTrim(token));
if (trimmedToken != "")
{
m_filterEraseSubtoken.emplace_back(trimmedToken);
}
}
}
else if (key == "filter_no_subtoken")
{
const auto tokens = StringSplit(value, std::string{","});
for (const auto& token: tokens)
{
const auto trimmedToken = StringToLower(StringTrim(token));
if (trimmedToken != "")
{
m_filterWordsNoSubtoken.emplace_back(trimmedToken);
}
}
}
else if (key == "max_threads")
{
m_maxThreads = stoi(value);
@ -118,6 +157,10 @@ void Configuration::Open(const std::string& filename)
{
m_maxTreeDepth = stoi(value);
}
else if (key == "minimum_subtoken_words")
{
m_minSubtokenWords = stoi(value);
}
else if (key == "nntp_server_host")
{
m_nntpServerHost = value;

View File

@ -35,6 +35,10 @@ namespace usenetsearch {
// Database class --------------------------------------------------------------
// Stores only a reference to the filter — the caller must guarantee that
// the Filter outlives this Database instance.
Database::Database(Filter& filter): m_filter(filter)
{
}
Database::~Database()
{
m_newsGroupFileIO.Close();
@ -184,7 +188,9 @@ void Database::SaveSearchTokens(
" ",
m_maxTreeDepth,
[&](const std::string& subToken, const std::string& str){
SaveToken(subToken, newsgroupID, articleID);
const std::string tok = m_filter.ProcessToken(subToken, str);
if (tok.empty()) return;
SaveToken(tok, newsgroupID, articleID);
}
);
}
@ -212,6 +218,26 @@ bool Database::HasToken(
return false;
}
std::unique_ptr<std::vector<ArticleEntry>> Database::LoadTokens(
const std::filesystem::path dbFile,
const std::string& subtoken)
{
auto result = std::make_unique<std::vector<ArticleEntry>>();
if (!std::filesystem::exists(dbFile)) return result;
SerializableFile io;
io.Open(dbFile);
const std::uint64_t tokenCount = io.ReadInt64();
const auto tokenHash = StringHashBytes(subtoken);
for (std::uint64_t ntok = 0; ntok != tokenCount; ++ntok)
{
ArticleEntry entry{};
io >> entry;
if (entry.hash != tokenHash) continue;
result->emplace_back(entry);
}
return result;
}
void Database::SaveToken(
const std::string& subtoken,
std::uint64_t newsgroupID,
@ -242,7 +268,47 @@ void Database::SaveToken(
io << std::uint64_t{1};
}
// write out token.
#if 0
std::cout << "Token: " << subtoken << std::endl;
std::cout << "Saving into file: " << path << std::endl;
std::cout << "Token hash: " << HashBytesToString(token.hash) << std::endl << std::endl;
#endif
io << token;
}
/**
 * Searches the token database for articles matching @p searchString.
 *
 * The search string is expanded into sub-tokens with StringTreeOperation
 * (presumably the same tokenization used at index time — verify against
 * SaveSearchTokens), each sub-token's database file is scanned, and all
 * matching entries are concatenated into the result.
 *
 * NOTE(review): results are not de-duplicated — an article matching
 * several sub-tokens will appear once per matching token. Confirm this
 * is intended before relying on result counts.
 *
 * @param searchString The (pre-filtered) query string.
 * @return All matching ArticleEntry records, never null.
 */
std::unique_ptr<std::vector<ArticleEntry>> Database::Search(
const std::string& searchString)
{
auto result = std::make_unique<std::vector<ArticleEntry>>();
// Tokenize the search string.
std::vector<std::string> searchTokens;
StringTreeOperation(
searchString,
" ",
m_maxTreeDepth,
[&searchTokens](const std::string& subToken, const std::string&){
searchTokens.emplace_back(subToken);
}
);
// Look up each sub-token's db file; skip tokens with no file on disk.
for (const auto& searchToken: searchTokens)
{
const auto path = GetTokenFilePath(searchToken, false);
const bool exists = std::filesystem::exists(path);
if (!exists) continue;
const auto foundTokens = LoadTokens(path, searchToken);
if (foundTokens->empty()) continue;
// Append this token's matches to the aggregate result.
result->insert(result->end(), foundTokens->begin(), foundTokens->end());
// Progress/diagnostic output for each token that produced hits.
std::cout << std::left << std::setw(searchString.length() + 7)
<< "token: " + searchToken
<< std::setw(3) << " | "
<< std::setw(10)
<< "db file: " << path.string()
<< std::setw(3) << " | "
<< std::setw(9)
<< "#results: " + std::to_string(foundTokens->size())
<< std::endl;
}
return result;
}
} // namespace usenetsearch

View File

@ -16,6 +16,7 @@
*/
#include <algorithm>
#include <iostream>
#include <regex>
#include "usenetsearch/StringUtils.h"
@ -24,6 +25,34 @@
namespace usenetsearch {
// Stores only a reference to the configuration — the caller must ensure
// the Configuration outlives this Filter. The regex map starts empty and
// is populated later by Init().
Filter::Filter(Configuration& config): m_config(config), m_eraseTokenRegexes{}
{
}
/**
 * Loads the word-filter configuration and pre-compiles the erase-token
 * regexes. Must be called after the Configuration has been opened.
 */
void Filter::Init()
{
    m_noSubtokenWords = m_config.FilterWordsNoSubtoken();
    // Pre-compile regexes for all the subtokens that should be erased:
    // one pattern each for a leading, a trailing, and an interior match,
    // mapped to the replacement text to substitute.
    for (const std::string& word : m_config.FilterEraseSubtoken())
    {
        const std::wstring pattern = m_conv.from_bytes(word);
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"^" + pattern + L"\\s+"),
            std::wstring{}
        );
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + pattern + L"$"),
            std::wstring{}
        );
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + pattern + L"\\s+"),
            std::wstring{L" "}
        );
    }
}
std::string Filter::ProcessSearchString(const std::string& searchString)
{
std::wstring str;
@ -44,8 +73,14 @@ std::string Filter::ProcessSearchString(const std::string& searchString)
// Remove Re: for obvious reasons
str = StringRemove(StringToLower(str), std::wstring{L"re:"});
// Remove punctuation and stuff by converting to whitespace
static std::wregex rxPunctuation(L"[\\.!?#$%^&~*()+\\[\\]\"-<>]+");
static std::wregex rxPunctuation(L"[\\.!?#$%^&~*\\(\\)\\+\\[\\]\"\\-<>]+");
str = std::regex_replace(str, rxPunctuation, L" ");
// Process erase subtoken list.
std::for_each(m_eraseTokenRegexes.begin(), m_eraseTokenRegexes.end(),
[&str](const auto& repl){
str = std::regex_replace(str, *repl.first, repl.second);
}
);
// Convert repeated whitespace to just one space.
static std::wregex rxWhitespaceMerge(L"\\s+");
str = std::regex_replace(str, rxWhitespaceMerge, L" ");
@ -66,4 +101,31 @@ std::string Filter::ProcessSearchString(const std::string& searchString)
return result;
}
/**
 * Applies the sub-token filters to a single token.
 *
 * A token that equals the full search string always bypasses the filters.
 * Otherwise the token is dropped (empty string returned) when it appears
 * in the configured "no subtoken" word list, or when it has fewer words
 * than the configured minimum sub-token word count.
 *
 * @param token        Sub-token produced by the tokenizer.
 * @param searchString Full string the token was derived from.
 * @return The (unchanged) token if it should be indexed, "" to skip it.
 */
std::string Filter::ProcessToken(
    const std::string& token,
    const std::string& searchString)
{
    // Whole-string matches bypass every sub-token filter.
    if (token == searchString)
    {
        return token;
    }
    // Process the nosubtokens list: such words only index on direct match.
    if (std::find(
        m_noSubtokenWords.begin(), m_noSubtokenWords.end(), token)
        != m_noSubtokenWords.end())
    {
        return "";
    }
    // Process min subtoken word count. Only split the token when the
    // setting is active (the original computed StringSplit unconditionally
    // and ended with a redundant `result = token;` reassignment).
    const std::uint16_t minWords = m_config.MinSubtokenWords();
    if (minWords > 1)
    {
        const auto words = StringSplit(token, std::string{" "});
        if (words.size() < minWords)
        {
            return "";
        }
    }
    return token;
}
} // namespace usenetsearch

View File

@ -26,19 +26,19 @@ namespace usenetsearch {
Indexer::Indexer(Application& app, UsenetClient& client)
: m_app(app), m_client(client)
{
m_threads.MaxThreads(m_app.Config().MaxThreads());
m_threads.MaxThreads(m_app.GetConfig().MaxThreads());
}
void Indexer::Connect()
{
m_client.Connect(
m_app.Config().NNTPServerHost(),
m_app.Config().NNTPServerPort(),
m_app.Config().NNTPServerSSL()
m_app.GetConfig().NNTPServerHost(),
m_app.GetConfig().NNTPServerPort(),
m_app.GetConfig().NNTPServerSSL()
);
m_client.Authenticate(
m_conv.from_bytes(m_app.Config().NNTPServerUser()),
m_conv.from_bytes(m_app.Config().NNTPServerPassword())
m_conv.from_bytes(m_app.GetConfig().NNTPServerUser()),
m_conv.from_bytes(m_app.GetConfig().NNTPServerPassword())
);
}
@ -47,7 +47,7 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
/**
* @todo Replace all stdout stuff with Logger class.
*/
const size_t batchSize = m_app.Config().BatchSize();
const size_t batchSize = m_app.GetConfig().BatchSize();
for (const auto& group: newsgroups)
{
const std::wstring newsgroup = m_conv.from_bytes(group.name);
@ -59,7 +59,7 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
<< "(.=" << batchSize << " headers)." << std::endl;
std::cout.flush();
std::atomic<std::uint64_t> headerCount{0};
std::reference_wrapper<Database> dbref = std::ref(m_app.Db());
std::reference_wrapper<Database> dbref = std::ref(m_app.GetDb());
m_client.ProcessHeaders(0,
[this, &headerCount, &dbref](std::shared_ptr<NntpHeaders> headers){
m_threads.Queue([this, headers, &headerCount, &dbref](){
@ -67,7 +67,9 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
{
const std::uint64_t id{header.articleID};
std::string subject = header.subject;
subject = m_filter.ProcessSearchString(subject);
subject = m_app.GetFilter().ProcessSearchString(
subject
);
if (subject == "") continue;
dbref.get().SaveSearchTokens(1, id, subject);
headerCount++;

View File

@ -433,7 +433,7 @@ SerializableFile& operator<<(
out.Write(std::uint8_t{2}); // start of text
for (std::uint8_t i: obj.hash)
{
out << obj.hash[i];
out.Write(i);
}
out << obj.newsgroupID;
out << obj.articleID;

View File

@ -36,7 +36,7 @@ int main(int argc, char* argv[])
}
);
app.Init(argc, argv);
app.Db().ParseTokenFile(dbFile, [](const ArticleEntry& token){
app.GetDb().ParseTokenFile(dbFile, [](const ArticleEntry& token){
std::cout << "Hash: " << HashBytesToString(token.hash) << " | "
<< "NewsgroupID: " << token.newsgroupID << " | "
<< "ArticleID: " << token.articleID << std::endl;

View File

@ -1,5 +1,28 @@
#include <iostream>
#include "usenetsearch/Application.h"
using namespace usenetsearch;
/**
 * Entry point for the usenetsearch search tool: parses the -s option,
 * normalizes the search string through the filter, and runs the search.
 *
 * @return 0 on success, 1 when no search string was supplied.
 */
int main(int argc, char* argv[])
{
    Application app;
    std::string searchString;
    app.AddStringOption('s', "Search string",
        [&searchString](const std::string& s){
            searchString = s;
        }
    );
    app.Init(argc, argv);
    if (searchString.empty())
    {
        std::cerr << "Missing search string." << std::endl;
        app.Usage(argv[0]);
        return 1;
    }
    searchString = app.GetFilter().ProcessSearchString(searchString);
    std::cout << searchString << std::endl;
    // BUG FIX: a stray `return 0;` here made the Search() call below
    // unreachable, so the tool printed the filtered string but never
    // actually queried the database.
    const auto searchResults = app.GetDb().Search(searchString);
    (void)searchResults; // Search() prints its own per-token results.
    return 0;
}

View File

@ -66,7 +66,7 @@ int main(int argc, char* argv[])
std::cout << "Getting newsgroup list...";
std::cout.flush();
list = client.List();
app.Db().UpdateNewsgroupList(*list);
app.GetDb().UpdateNewsgroupList(*list);
std::cout << "DONE." << std::endl;
std::cout.flush();
}

View File

@ -1,16 +1,80 @@
# NNTP server configuration details
nntp_server_host: news.example.com
#####################################
# NNTP server configuration details #
#####################################
nntp_server_host: my.new.server.example.com
nntp_server_port: 119
nntp_server_user: configureMe
nntp_server_pass: configureMe
nntp_server_user: someuser
nntp_server_pass: changeme
nntp_server_use_ssl: no
# Index database configuration details
#################
# Path settings #
#################
# database location (relative or absolute path)
database_path: ./db
# Parallel processing settings
max_threads: 2
batch_size: 5
####################
# Storage settings #
####################
# Storage settings
max_tree_depth: 5
# A higher tree depth creates more search tokens, so it improves the speed and
# likelihood of finding search results, at the cost of extra storage
# requirements, more files, slower indexing.
max_tree_depth: 10
################################
# Parallel processing settings #
################################
# If you're processing headers faster than you can pull them down over the
# network, you're likely not going to need more than 1 or 2 threads, but
# otherwise, more threads can help. The batch size should be large enough such
# that all configured threads have enough work to do.
#
# The higher your max_tree_depth, the more likely you'll need to increase
# this.
max_threads: 8
batch_size: 1000
########################
# Word filter settings #
########################
# It's important to filter out commonly used words to avoid blowing up an index
# in size. Huge indexes are going to eat a lot of disk space and slow down
# searches.
# This setting lists all substrings that should be erased from subjects and
# search strings before they are tokenized. For instance, you might not want to
# store all results for the word "the", or "in" and other stopwords.
# List of strings is comma-separated and case-insensitive. Each subsequent
# option appends to the previously defined list.
filter_erase_subtoken: a,about,actually,almost,also,although,always,am,an,and
filter_erase_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
filter_erase_subtoken: does,each,either,else,for,from,had,has,have,hence,how
filter_erase_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
filter_erase_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where
filter_erase_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
filter_erase_subtoken: whoever,whose,why,will,with,within,without,would,yes
filter_erase_subtoken: yet,you,your
# This setting lets you list all tokens that will only be indexed on direct
# (whole string) matches. Each token is comma-separated, and the configuration
# option may be listed multiple times as well, each subsequent option appends to
# the previously defined list. All tokens are case-insensitive.
filter_no_subtoken: makes for,funny business
# Sets the minimum number of words in a sub-token. You may use this if you don't
# want to index single-words unless they are a direct match to the subject (in
# which case, you'd set this to a minimum of 2 words) - or you may even want a
# higher minimum than that if you're really wanting to optimize search speed and
# disk usage.
minimum_subtoken_words: 2