From bb9c3da3d88c8469b74bafc218cdc91b0ec6c3d9 Mon Sep 17 00:00:00 2001 From: John Sennesael Date: Mon, 18 Oct 2021 20:19:11 -0500 Subject: [PATCH] Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy) --- CMakeLists.txt | 12 +- include/usenetsearch/Application.h | 13 +- include/usenetsearch/Configuration.h | 9 +- include/usenetsearch/Database.h | 30 ++--- include/usenetsearch/Filter.h | 6 +- include/usenetsearch/Indexer.h | 35 +++++ include/usenetsearch/StringUtils.h | 4 + include/usenetsearch/UsenetClient.h | 20 ++- src/Application.cpp | 49 ++++++- src/Configuration.cpp | 54 +++++++- src/Database.cpp | 187 +++++++++++++++++++++------ src/Filter.cpp | 44 ++++--- src/Indexer.cpp | 152 +++++++++++++++++++++- src/Serialize.cpp | 30 ++++- src/StringUtils.cpp | 42 +++++- src/UsenetClient.cpp | 160 ++++++++++++----------- src/dbdump.cpp | 82 ++++++++++++ src/tokendump.cpp | 45 ------- src/usenetfind.cpp | 36 +++++- src/usenetindexd.cpp | 51 ++------ usenetsearch.example.conf | 44 +++++-- 21 files changed, 809 insertions(+), 296 deletions(-) create mode 100644 src/dbdump.cpp delete mode 100644 src/tokendump.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 68b4b1f..a40493d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ add_library(usenetsearch "src/Dns.cpp" "src/Except.cpp" "src/Filter.cpp" + "src/Indexer.cpp" "src/IoSocket.cpp" "src/Serialize.cpp" "src/SSLConnection.cpp" @@ -64,7 +65,6 @@ target_link_libraries(usenetsearch # Indexer executable ----------------------------------------------------------- add_executable(usenetindexd - "src/Indexer.cpp" "src/usenetindexd.cpp" ) @@ -94,18 +94,18 @@ target_include_directories(usenetfind include ) -# tokendump executable --------------------------------------------------------- +# dbdump executable ------------------------------------------------------------ -add_executable(tokendump - "src/tokendump.cpp" +add_executable(dbdump + "src/dbdump.cpp" ) -target_link_libraries(tokendump +target_link_libraries(dbdump PUBLIC ${OPENSSL_LIBRARIES} stdc++fs PRIVATE usenetsearch ) -target_include_directories(tokendump +target_include_directories(dbdump PRIVATE include ) diff --git a/include/usenetsearch/Application.h b/include/usenetsearch/Application.h index a6801a8..ecf4501 100644 --- a/include/usenetsearch/Application.h +++ b/include/usenetsearch/Application.h @@ -29,7 +29,7 @@ namespace usenetsearch { enum class CommandLineOptionType { - Boolean, Path, String + Boolean, Integer, Path, String }; struct CommandLineOption @@ -78,16 +78,25 @@ public: std::function onParse, std::filesystem::path defaultValue = "." ); + + void AddIntegerOption( + char option, + const std::string& help, + std::function onParse, + int defaultValue = 0 + ); + void AddStringOption( char option, const std::string& help, std::function onParse, std::string defaultValue = "" ); + bool CanRun() const; Configuration& GetConfig(); Database& GetDb(); Filter& GetFilter(); - void Init(int argc, char* argv[]); + bool Init(int argc, char* argv[]); void Usage(const std::string& programName); }; diff --git a/include/usenetsearch/Configuration.h b/include/usenetsearch/Configuration.h index ea0dac2..be6e3a6 100644 --- a/include/usenetsearch/Configuration.h +++ b/include/usenetsearch/Configuration.h @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include "usenetsearch/Except.h" @@ -36,6 +37,8 @@ class Configuration { std::uint16_t m_batchSize{1}; std::vector m_filterEraseSubtoken; + std::vector m_filterNewsgroupBlacklist; + std::vector m_filterNewsgroupWhitelist; std::vector m_filterWordsNoSubtoken; std::uint16_t m_maxThreads{1}; std::uint8_t m_maxTreeDepth{5}; @@ -51,8 +54,10 @@ public: std::uint16_t BatchSize() const; std::filesystem::path DatabasePath() const; - std::vector FilterEraseSubtoken() const; - std::vector FilterWordsNoSubtoken() const; + std::vector& FilterEraseSubtoken(); + std::vector& FilterNewsgroupBlacklist(); + std::vector& FilterNewsgroupWhitelist(); + std::vector& FilterWordsNoSubtoken(); std::uint16_t MaxThreads() const; std::uint8_t MaxTreeDepth() const; std::uint16_t MinSubtokenWords() const; diff --git a/include/usenetsearch/Database.h b/include/usenetsearch/Database.h index 383d483..e14b312 100644 --- a/include/usenetsearch/Database.h +++ b/include/usenetsearch/Database.h @@ -32,6 +32,7 @@ namespace usenetsearch { +class Application; static constexpr const std::uint64_t DatabaseVersion{1}; struct ArticleEntry @@ -55,27 +56,21 @@ struct DatabaseException: public UsenetSearchException class Database { - std::wstring_convert> m_conv; + Application& m_app; std::filesystem::path m_databasePath; std::uint64_t m_databaseVersion{DatabaseVersion}; - Filter& m_filter; std::vector m_lockedFiles; std::mutex m_lockedFilesMutex; std::uint8_t m_maxTreeDepth{5}; SerializableFile m_newsGroupFileIO; - bool GetArticleEntry( - const std::string& subToken, - const std::string& searchString, - ArticleEntry& entry, - size_t& startPosition, - size_t& endPosition, - size_t& count); - std::filesystem::path GetTokenFilePath( const std::string& token, bool mkdirs=false ); + std::uint64_t GetUniqueNntpEntryId( + const std::vector& list + ) const; bool HasToken( const std::string& subtoken, std::uint64_t newsgroupID, @@ -94,11 +89,10 @@ class Database public: - explicit Database(Filter& filter); + explicit Database(Application& app); ~Database(); - std::unique_ptr> LoadArticleList( - const std::wstring& newsgroup - ); + std::unique_ptr FindNntpEntry(const std::string& subject); + std::uint32_t GetLastIndexedArticle(std::uint64_t newsgroupID); std::unique_ptr> LoadNewsgroupList(); void MaxTreeDepth(std::uint8_t depth); void Open(std::filesystem::path dbPath); @@ -114,11 +108,11 @@ public: std::unique_ptr> Search( const std::string& searchString ); - void UpdateArticleList( - const std::wstring& newsgroup, - const std::vector& headers + void SetLastIndexedArticle( + std::uint64_t newsgroupID, + std::int32_t articleID ); - void UpdateNewsgroupList(const std::vector& list); + void UpdateNewsgroupList(std::vector& list); }; diff --git a/include/usenetsearch/Filter.h b/include/usenetsearch/Filter.h index aca4393..89d5e93 100644 --- a/include/usenetsearch/Filter.h +++ b/include/usenetsearch/Filter.h @@ -31,7 +31,6 @@ class Filter { Configuration& m_config; - std::wstring_convert> m_conv; std::vector m_noSubtokenWords; std::unordered_map, std::wstring> m_eraseTokenRegexes; @@ -40,11 +39,12 @@ public: explicit Filter(Configuration& config); void Init(); - std::string ProcessSearchString(const std::string& searchString); + bool ProcessNewsgroup(const std::string& newsgroup) const; + std::string ProcessSearchString(const std::string& searchString) const; std::string ProcessToken( const std::string& token, const std::string& searchString - ); + ) const; }; diff --git a/include/usenetsearch/Indexer.h b/include/usenetsearch/Indexer.h index abb9ae1..2683e6d 100644 --- a/include/usenetsearch/Indexer.h +++ b/include/usenetsearch/Indexer.h @@ -18,7 +18,9 @@ #pragma once #include +#include #include +#include #include "usenetsearch/Application.h" #include "usenetsearch/Filter.h" @@ -27,6 +29,36 @@ namespace usenetsearch { +class SearchResult +{ + + std::uint32_t m_newsgroupId{0}; + std::uint32_t m_articleId{0}; + size_t m_numHits{0}; + +public: + + SearchResult() = default; + SearchResult(const ArticleEntry& entry); + SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId); + SearchResult(const SearchResult& other); + + std::uint32_t ArticleId() const; + size_t Hits() const; + void Inc(); + std::uint32_t NewsgroupId() const; + + void operator=(const SearchResult& other); + bool operator==(const SearchResult& other) const; + bool operator!=(const SearchResult& other) const; + bool operator<(const SearchResult& other) const; + bool operator>(const SearchResult& other) const; + bool operator>=(const SearchResult& other) const; + bool operator<=(const SearchResult& other) const; +}; + +typedef std::vector SearchResults; + class Indexer { Application& m_app; @@ -40,6 +72,9 @@ public: void Connect(); void Index(const std::vector& newsgroups); + std::unique_ptr Search( + const std::string& searchString + ); }; diff --git a/include/usenetsearch/StringUtils.h b/include/usenetsearch/StringUtils.h index 5dfaa76..6e5eee9 100644 --- a/include/usenetsearch/StringUtils.h +++ b/include/usenetsearch/StringUtils.h @@ -39,6 +39,8 @@ std::string CharToHex(const char c); std::string HashBytesToString(const std::array& input); +std::string StringFromWideString(const std::wstring& input); + std::string StringHash(const std::string& input); std::array StringHashBytes(const std::string& input); @@ -137,4 +139,6 @@ void StringTreeOperation( std::function Fn ); +std::wstring WideStringFromString(const std::string& input); + } // namespace usenetsearch diff --git a/include/usenetsearch/UsenetClient.h b/include/usenetsearch/UsenetClient.h index b55a182..b2aa0d4 100644 --- a/include/usenetsearch/UsenetClient.h +++ b/include/usenetsearch/UsenetClient.h @@ -31,6 +31,8 @@ namespace usenetsearch { +class Application; + struct UsenetClientException: public UsenetSearchException { UsenetClientException(int errorCode, const std::string& message): @@ -59,16 +61,19 @@ struct NntpMessage struct NntpListEntry { - std::string name; + std::uint64_t id; + std::uint32_t lastIndexedArticle; + std::uint64_t count; std::uint64_t high; std::uint64_t low; - std::uint64_t count; + std::string name; std::string status; }; class UsenetClient { - std::wstring_convert> m_conv; + + Application& m_app; std::unique_ptr m_ssl; std::unique_ptr m_tcp; bool m_useSSL{false}; @@ -80,14 +85,7 @@ class UsenetClient public: - /* Expected flow: - * Connect - * Authenticate - * List() to get a list of newsgroups - * for every newsgroup: - * XZHDR subject 0- - * uncompress result. - */ + UsenetClient(Application& app); void Authenticate(const std::wstring& user, const std::wstring& password); diff --git a/src/Application.cpp b/src/Application.cpp index c853bc8..7c9f538 100644 --- a/src/Application.cpp +++ b/src/Application.cpp @@ -41,6 +41,21 @@ void Application::AddBooleanOption( m_commandLineArguments.emplace_back(std::move(val)); } +void Application::AddIntegerOption( + char option, + const std::string& help, + std::function onParse, + int defaultValue) +{ + auto val = std::make_shared>(); + val->type = CommandLineOptionType::Integer; + val->option = option; + val->helpText = help; + val->value = defaultValue; + val->onParse = onParse; + m_commandLineArguments.emplace_back(std::move(val)); +} + void Application::AddFileOption( char option, const std::string& help, @@ -91,6 +106,9 @@ void Application::Usage(const std::string& programName) case CommandLineOptionType::Boolean: std::cout << "[-" << optionValue->option << "] "; break; + case CommandLineOptionType::Integer: + std::cout << "[-" << optionValue->option << " ] "; + break; case CommandLineOptionType::Path: std::cout << "[-" << optionValue->option << " ] "; break; @@ -112,6 +130,10 @@ void Application::Usage(const std::string& programName) std::cout << "-" << optionValue->option << "\t" << optionValue->helpText << std::endl; break; + case CommandLineOptionType::Integer: + std::cout << "-" << optionValue->option << " \t" + << optionValue->helpText << std::endl; + break; case CommandLineOptionType::Path: std::cout << "-" << optionValue->option << " \t" << optionValue->helpText << std::endl; @@ -125,11 +147,16 @@ void Application::Usage(const std::string& programName) std::cout << std::endl; } -Application::Application() : m_db(m_filter), m_filter(m_config) +Application::Application() : m_db(*this), m_filter(m_config) { std::cout.setf(std::ios::unitbuf); } +bool Application::CanRun() const +{ + return m_canRun; +} + Configuration& Application::GetConfig() { return m_config; @@ -161,6 +188,21 @@ void Application::ExecuteCustomOption( castedOption->onParse(true); } break; + case CommandLineOptionType::Integer: + { + std::shared_ptr> castedOption = + std::dynamic_pointer_cast>( + opt + ); + if (castedOption == nullptr) + { + throw std::runtime_error( + "Could not cast cli arg to the correct type." + ); + } + castedOption->onParse(std::stoi(value)); + } + break; case CommandLineOptionType::String: { std::shared_ptr> @@ -203,15 +245,16 @@ Filter& Application::GetFilter() return m_filter; } -void Application::Init(int argc, char* argv[]) +bool Application::Init(int argc, char* argv[]) { ParseArgs(argc, argv); - if (!m_canRun) return; + if (!m_canRun) return false; // Read config, setup db m_config.Open(m_configFile); m_db.MaxTreeDepth(m_config.MaxTreeDepth()); m_db.Open(m_config.DatabasePath()); m_filter.Init(); + return true; } void Application::ParseArgs(int argc, char* argv[]) diff --git a/src/Configuration.cpp b/src/Configuration.cpp index cc21231..65a90ba 100644 --- a/src/Configuration.cpp +++ b/src/Configuration.cpp @@ -17,8 +17,10 @@ #include #include +#include #include +#include "usenetsearch/ScopeExit.h" #include "usenetsearch/StringUtils.h" #include "usenetsearch/Configuration.h" @@ -35,12 +37,22 @@ std::filesystem::path Configuration::DatabasePath() const return m_databasePath; } -std::vector Configuration::FilterEraseSubtoken() const +std::vector& Configuration::FilterEraseSubtoken() { return m_filterEraseSubtoken; } -std::vector Configuration::FilterWordsNoSubtoken() const +std::vector& Configuration::FilterNewsgroupBlacklist() +{ + return m_filterNewsgroupBlacklist; +} + +std::vector& Configuration::FilterNewsgroupWhitelist() +{ + return m_filterNewsgroupWhitelist; +} + +std::vector& Configuration::FilterWordsNoSubtoken() { return m_filterWordsNoSubtoken; } @@ -95,6 +107,7 @@ void Configuration::Open(const std::string& filename) "Could not open configuration file: " + filename ); } + ScopeExit finCloser([&fin](){ fin.close(); }); int line_nr = 0; while(std::getline(fin,line)) { @@ -108,7 +121,6 @@ void Configuration::Open(const std::string& filename) const auto kvp = StringSplit(line, std::string{":"}, 2); if (kvp.size() != 2) { - fin.close(); throw ConfigurationException(EINVAL, std::string("Invalid configuration in ") + filename + std::string(" line ") @@ -137,6 +149,40 @@ void Configuration::Open(const std::string& filename) } } } + else if (key == "filter_newsgroup_blacklist") + { + try + { + std::regex re(value); + m_filterNewsgroupBlacklist.emplace_back(re); + } + catch (const std::regex_error& e) + { + throw ConfigurationException(EINVAL, + std::string("Invalid configuration in ") + + filename + std::string(" line ") + + std::to_string(line_nr) + " : Regular expression \"" + + value + "\" did not parse: " + e.what() + ); + } + } + else if (key == "filter_newsgroup_whitelist") + { + try + { + std::regex re(value); + m_filterNewsgroupWhitelist.emplace_back(re); + } + catch (const std::regex_error& e) + { + throw ConfigurationException(EINVAL, + std::string("Invalid configuration in ") + + filename + std::string(" line ") + + std::to_string(line_nr) + " : Regular expression \"" + + value + "\" did not parse: " + e.what() + ); + } + } else if (key == "filter_no_subtoken") { const auto tokens = StringSplit(value, std::string{","}); @@ -195,7 +241,6 @@ void Configuration::Open(const std::string& filename) } else { - fin.close(); throw ConfigurationException(EINVAL, std::string("Invalid configuration in ") + filename + std::string(" line ") @@ -203,7 +248,6 @@ void Configuration::Open(const std::string& filename) ); } } - fin.close(); } } // namespace usenetsearch diff --git a/src/Database.cpp b/src/Database.cpp index 0772505..f57c41f 100644 --- a/src/Database.cpp +++ b/src/Database.cpp @@ -24,6 +24,7 @@ #include #include +#include "usenetsearch/Application.h" #include "usenetsearch/StringUtils.h" #include "usenetsearch/UsenetClient.h" #include "usenetsearch/ScopeExit.h" @@ -35,7 +36,7 @@ namespace usenetsearch { // Database class -------------------------------------------------------------- -Database::Database(Filter& filter): m_filter(filter) +Database::Database(Application& app): m_app(app) { } @@ -44,15 +45,41 @@ Database::~Database() m_newsGroupFileIO.Close(); } -bool Database::GetArticleEntry( - const std::string& subToken, - const std::string& searchString, - ArticleEntry& entry, - size_t& startPosition, - size_t& endPosition, - size_t& count) +std::unique_ptr Database::FindNntpEntry( + const std::string& subject) { - return false; + OpenNewsGroupFile(); + ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); }); + const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64(); + std::unique_ptr result = nullptr; + for (std::uint64_t n = 0; n != numGroups; ++n) + { + NntpListEntry entry; + m_newsGroupFileIO >> entry; + if (entry.name == subject) + { + result = std::make_unique(entry); + break; + } + } + return result; +} + +std::uint32_t Database::GetLastIndexedArticle(std::uint64_t newsgroupID) +{ + OpenNewsGroupFile(); + ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); }); + const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64(); + for (std::uint64_t n = 0; n != numGroups; ++n) + { + NntpListEntry entry; + m_newsGroupFileIO >> entry; + if (entry.id == newsgroupID) + { + return entry.lastIndexedArticle; + } + } + return 0; } std::filesystem::path Database::GetTokenFilePath( @@ -77,29 +104,29 @@ std::filesystem::path Database::GetTokenFilePath( return groupPath / groupFile; } +std::uint64_t Database::GetUniqueNntpEntryId( + const std::vector& list) const +{ + std::uint64_t result{0}; + for (auto& entry: list) + { + if (result <= entry.id) + { + result = entry.id + 1; + } + } + return result; +} + void Database::MaxTreeDepth(std::uint8_t depth) { m_maxTreeDepth = depth; } -std::unique_ptr> Database::LoadArticleList( - const std::wstring& newsgroup) -{ - -} - std::unique_ptr> Database::LoadNewsgroupList() { OpenNewsGroupFile(); - const std::uint64_t dbVersion = m_newsGroupFileIO.ReadInt64(); - if (dbVersion != m_databaseVersion) - { - throw DatabaseException(EINVAL, - "The loaded database version (" + std::to_string(dbVersion) - + ") does not match the current database version (" - + std::to_string(m_databaseVersion) + ")"); - } - + ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); }); const size_t newsGroupCount = m_newsGroupFileIO.ReadInt64(); auto result = std::make_unique>(); @@ -119,18 +146,37 @@ void Database::Open(std::filesystem::path dbPath) { std::filesystem::create_directory(dbPath); } - OpenNewsGroupFile(); } void Database::OpenNewsGroupFile() { if (m_newsGroupFileIO.IsOpen()) { + m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg); return; } const std::filesystem::path newsGroupFilePath = m_databasePath / "newsgroups.db"; + const bool exists = std::filesystem::exists(newsGroupFilePath); m_newsGroupFileIO.Open(newsGroupFilePath); + if (exists) + { + const std::uint64_t ver = m_newsGroupFileIO.ReadInt64(); + if (ver != m_databaseVersion) + { + throw DatabaseException(EBADF, + std::string{"Mismatching newgroup file database version:"} + + " have: " + std::to_string(ver) + " - want: " + + std::to_string(m_databaseVersion) + ); + } + } + else + { + m_newsGroupFileIO << m_databaseVersion; + m_newsGroupFileIO << std::uint64_t{0}; // newsgroup count. + m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg); + } } void Database::ParseTokenFile( @@ -155,26 +201,30 @@ void Database::ParseTokenFile( } } -void Database::UpdateArticleList( - const std::wstring& newsgroup, - const std::vector& headers) +void Database::SetLastIndexedArticle( + std::uint64_t newsgroupID, + std::int32_t articleID) { - -} - -void Database::UpdateNewsgroupList(const std::vector& list) -{ - OpenNewsGroupFile(); - - m_newsGroupFileIO << m_databaseVersion; - - const std::uint64_t newsGroupCount = list.size(); - m_newsGroupFileIO << newsGroupCount; - - for (const auto& entry: list) + auto outItems = LoadNewsgroupList(); + bool found{false}; + if (outItems) { - m_newsGroupFileIO << entry; + for (auto& entry: *outItems) + { + if (entry.id == newsgroupID) + { + entry.lastIndexedArticle = articleID; + found = true; + } + } } + if (!found) + { + throw DatabaseException(EINVAL, + "Attempt to update newsgroup not found in database - id: " + + std::to_string(newsgroupID)); + } + UpdateNewsgroupList(*outItems); } void Database::SaveSearchTokens( @@ -188,7 +238,10 @@ void Database::SaveSearchTokens( " ", m_maxTreeDepth, [&](const std::string& subToken, const std::string& str){ - const std::string tok = m_filter.ProcessToken(subToken, str); + const std::string tok = m_app.GetFilter().ProcessToken( + subToken, + str + ); if (tok.empty()) return; SaveToken(tok, newsgroupID, articleID); } @@ -311,4 +364,52 @@ std::unique_ptr> Database::Search( return result; } +void Database::UpdateNewsgroupList(std::vector& list) +{ + if (list.size() == 0) return; + + auto outList = LoadNewsgroupList(); + for (auto& entry: list) + { + NntpListEntry newEntry(entry); + bool found{false}; + if (outList) + { + std::for_each( + outList->begin(), + outList->end(), + [&entry, &found](NntpListEntry& oldEntry) + { + if (oldEntry.name == entry.name) + { + // update existing (copy everything but ID & name) + found = true; + oldEntry.count = entry.count; + oldEntry.high = entry.high; + oldEntry.lastIndexedArticle = entry.lastIndexedArticle; + oldEntry.low = entry.low; + oldEntry.status = entry.status; + } + } + ); + } + if (found) continue; + // add new. + newEntry.id = GetUniqueNntpEntryId(*outList); + outList->emplace_back(newEntry); + entry.id = newEntry.id; + } + OpenNewsGroupFile(); + ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); }); + m_newsGroupFileIO << std::uint64_t{outList->size()}; + std::for_each( + outList->begin(), + outList->end(), + [&](const NntpListEntry& e) + { + m_newsGroupFileIO << e; + } + ); +} + } // namespace usenetsearch diff --git a/src/Filter.cpp b/src/Filter.cpp index 9f57dbb..5b1317b 100644 --- a/src/Filter.cpp +++ b/src/Filter.cpp @@ -36,7 +36,7 @@ void Filter::Init() // Pre-compile regexes for all the subtokens that should be erased. std::for_each(eraseTokens.begin(), eraseTokens.end(), [&](const std::string& tok){ - const std::wstring wtok = m_conv.from_bytes(tok); + const std::wstring wtok = WideStringFromString(tok); m_eraseTokenRegexes.emplace( std::make_unique(L"^" + wtok + L"\\s+"), std::wstring{} @@ -53,17 +53,34 @@ void Filter::Init() ); } -std::string Filter::ProcessSearchString(const std::string& searchString) +bool Filter::ProcessNewsgroup(const std::string& newsgroup) const { - std::wstring str; - try + for (const auto& blackRe: m_config.FilterNewsgroupBlacklist()) { - str = m_conv.from_bytes(searchString); + std::smatch matches; + if (std::regex_match(newsgroup, matches, blackRe)) + { + if (matches.size() > 0) return false; + } } - catch (const std::range_error&) + if (m_config.FilterNewsgroupWhitelist().size() > 0) { - return ""; // string is not valid utf8 + for (const auto& whiteRe: m_config.FilterNewsgroupWhitelist()) + { + std::smatch matches; + if (std::regex_match(newsgroup, matches, whiteRe)) + { + if (matches.size() > 0) return true; + } + } + return false; } + return true; +} + +std::string Filter::ProcessSearchString(const std::string& searchString) const +{ + std::wstring str = WideStringFromString(searchString); std::remove_if(str.begin(), str.end(), [](wchar_t c){ // Remove control characters. if (c < 0x20) return true; // ascii control chars @@ -89,21 +106,12 @@ std::string Filter::ProcessSearchString(const std::string& searchString) // Convert strings that are ONLY whitespace to blank strings. static std::wregex rxAllWhitespace(L"^\\s+$"); str = std::regex_replace(str, rxAllWhitespace, L""); - std::string result; - try - { - result = m_conv.to_bytes(str); - } - catch (const std::range_error&) - { - return ""; - } - return result; + return StringFromWideString(str); } std::string Filter::ProcessToken( const std::string& token, - const std::string& searchString) + const std::string& searchString) const { std::string result = token; // Process the nosubtokens list. diff --git a/src/Indexer.cpp b/src/Indexer.cpp index 3f390f7..dcfbb07 100644 --- a/src/Indexer.cpp +++ b/src/Indexer.cpp @@ -23,6 +23,92 @@ namespace usenetsearch { +// SearchResult class ---------------------------------------------------------- + +SearchResult::SearchResult(const ArticleEntry& entry) +{ + m_newsgroupId = entry.newsgroupID; + m_articleId = entry.articleID; +} + +SearchResult::SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId): + m_newsgroupId(newsgroupId), m_articleId(articleId) +{ + +} + +SearchResult::SearchResult(const SearchResult& other) +{ + m_articleId = other.m_articleId; + m_newsgroupId = other.m_newsgroupId; + m_numHits = other.m_numHits; +}; + +std::uint32_t SearchResult::ArticleId() const +{ + return m_articleId; +} + +size_t SearchResult::Hits() const +{ + return m_numHits; +} + +void SearchResult::Inc() +{ + m_numHits++; +} + +std::uint32_t SearchResult::NewsgroupId() const +{ + return m_newsgroupId; +} + +void SearchResult::operator=(const SearchResult& other) +{ + m_articleId = other.m_articleId; + m_newsgroupId = other.m_newsgroupId; + m_numHits = other.m_numHits; +} + +bool SearchResult::operator==(const SearchResult& other) const +{ + const bool result = + (m_articleId == other.m_articleId) + && (m_newsgroupId == other.m_newsgroupId); + return result; +} + +bool SearchResult::operator!=(const SearchResult& other) const +{ + return ( + (m_articleId != other.m_articleId) + || (m_newsgroupId != other.m_newsgroupId) + ); +} + +bool SearchResult::operator<(const SearchResult& other) const +{ + return m_numHits < other.m_numHits; +} + +bool SearchResult::operator>(const SearchResult& other) const +{ + return m_numHits > other.m_numHits; +} + +bool SearchResult::operator>=(const SearchResult& other) const +{ + return m_numHits >= other.m_numHits; +} + +bool SearchResult::operator<=(const SearchResult& other) const +{ + return m_numHits <= other.m_numHits; +} + +// Indexer class --------------------------------------------------------------- + Indexer::Indexer(Application& app, UsenetClient& client) : m_app(app), m_client(client) { @@ -59,20 +145,41 @@ void Indexer::Index(const std::vector& newsgroups) << "(.=" << batchSize << " headers)." << std::endl; std::cout.flush(); std::atomic headerCount{0}; + const std::atomic groupID = group.id; std::reference_wrapper dbref = std::ref(m_app.GetDb()); - m_client.ProcessHeaders(0, - [this, &headerCount, &dbref](std::shared_ptr headers){ - m_threads.Queue([this, headers, &headerCount, &dbref](){ + const std::uint32_t startMessage = dbref.get().GetLastIndexedArticle( + groupID + ); + std::cout << "Indexing starting at message: " + << std::to_string(startMessage) << std::endl; + m_client.ProcessHeaders(startMessage, + [this, &headerCount, &dbref, &groupID](std::shared_ptr headers){ + m_threads.Queue([this, headers, &headerCount, &dbref, &groupID](){ + std::uint64_t lastArticle{0}; for (const auto& header: *headers) { - const std::uint64_t id{header.articleID}; + const std::uint64_t articleID{header.articleID}; std::string subject = header.subject; subject = m_app.GetFilter().ProcessSearchString( subject ); if (subject == "") continue; - dbref.get().SaveSearchTokens(1, id, subject); + dbref.get().SaveSearchTokens( + groupID, + articleID, + subject + ); headerCount++; + if (articleID > lastArticle) lastArticle = articleID; + } + // Update last-indexed id for the newsgroup. + const std::uint32_t lastIndexedID = + dbref.get().GetLastIndexedArticle(groupID); + if (lastIndexedID < lastArticle) + { + dbref.get().SetLastIndexedArticle( + groupID, lastArticle + ); } std::cout << "."; std::cout.flush(); @@ -87,4 +194,39 @@ void Indexer::Index(const std::vector& newsgroups) } } +std::unique_ptr Indexer::Search( + const std::string& searchString) +{ + auto result = std::make_unique(); + const std::string sstr = m_app.GetFilter().ProcessSearchString( + searchString + ); + auto searchResults = m_app.GetDb().Search(sstr); + if (!searchResults) return result; + for(const ArticleEntry& entry: *searchResults) + { + SearchResult sr(entry); + // Check if a matching entry already exists in the result set, if so, + // increment count. Otherwise, append a new entry. + auto it = std::find(result->begin(), result->end(), sr); + if (it != result->end()) + { + (*it).Inc(); + } + else + { + sr.Inc(); + result->emplace_back(sr); + } + } + std::sort( + result->begin(), + result->end(), + [](const SearchResult& a, const SearchResult& b){ + return a.Hits() > b.Hits(); + } + ); + return result; +} + } // namespace usenetsearch diff --git a/src/Serialize.cpp b/src/Serialize.cpp index 584de90..cfeac42 100644 --- a/src/Serialize.cpp +++ b/src/Serialize.cpp @@ -170,7 +170,7 @@ std::string SerializableFile::ReadStr(size_t size) const RangeUnlock(startPos, size); }); size_t bytesRead{0}; - std::string result(size + 1, '\0'); + std::string result(size, '\0'); while (bytesRead < size) { const auto readNow = read(m_fd, &result[0], size); @@ -485,21 +485,49 @@ SerializableFile& operator>>(SerializableFile& in, NntpHeader& obj) SerializableFile& operator<<(SerializableFile& out, const NntpListEntry& obj) { + out.Write(std::uint8_t{1}); // start of heading + out.Write(std::uint8_t{2}); // start of text + out << obj.id; + out << obj.lastIndexedArticle; out << obj.count; out << obj.high; out << obj.low; out << obj.name; out << obj.status; + out.Write(std::uint8_t{3}); // end of text + out.Write(std::uint8_t{4}); // end of transmission return out; } SerializableFile& operator>>(SerializableFile& in, NntpListEntry& obj) { + std::uint8_t SOH{}; + std::uint8_t STX{}; + std::uint8_t ETX{}; + std::uint8_t EOT{}; + in >> SOH; + in >> STX; + if ((SOH != 1) || (STX != 2)) + { + throw SerializeException(EBADMSG, + "Bad magic number in NNTP entry header." + ); + } + in >> obj.id; + in >> obj.lastIndexedArticle; in >> obj.count; in >> obj.high; in >> obj.low; in >> obj.name; in >> obj.status; + in >> ETX; + in >> EOT; + if ((ETX != 3) || (EOT != 4)) + { + throw SerializeException(EBADMSG, + "Bad magic number in NNTP entry footer." + ); + } return in; } diff --git a/src/StringUtils.cpp b/src/StringUtils.cpp index 46fa8e2..a323da7 100644 --- a/src/StringUtils.cpp +++ b/src/StringUtils.cpp @@ -30,6 +30,31 @@ namespace usenetsearch { +static std::wstring_convert> conv; + +std::string CharToHex(const char c) +{ + const int val = c; + std::ostringstream result; + result << std::setw(2) << std::setfill('0') << std::hex; + result << val; + return result.str(); +} + +std::string StringFromWideString(const std::wstring& input) +{ + std::string result; + try + { + result = conv.to_bytes(input); + } + catch(const std::range_error&) + { + return ""; + } + return result; +} + std::string StringHash(const std::string& input) { unsigned char result[MD5_DIGEST_LENGTH]; @@ -123,13 +148,18 @@ void StringTreeOperation( } } -std::string CharToHex(const char c) +std::wstring WideStringFromString(const std::string& input) { - const int val = c; - std::ostringstream result; - result << std::setw(2) << std::setfill('0') << std::hex; - result << val; - return result.str(); + std::wstring result; + try + { + result = conv.from_bytes(input); + } + catch(const std::range_error&) + { + return L""; + } + return result; } } // namespace usenetsearch diff --git a/src/UsenetClient.cpp b/src/UsenetClient.cpp index 811b6a2..b2abc56 100644 --- a/src/UsenetClient.cpp +++ b/src/UsenetClient.cpp @@ -22,6 +22,7 @@ #include #include +#include "usenetsearch/Application.h" #include "usenetsearch/Except.h" #include "usenetsearch/StringUtils.h" @@ -31,6 +32,10 @@ namespace usenetsearch { // UsenetClient class ---------------------------------------------------------- +UsenetClient::UsenetClient(Application& app): m_app(app) +{ +} + void UsenetClient::Authenticate( const std::wstring& user, const std::wstring& password) @@ -105,7 +110,7 @@ void UsenetClient::Group(const std::wstring& newsgroup) { throw UsenetClientException( response.code, - "Error changing group to " + m_conv.to_bytes(newsgroup) + " : " + "Error changing group to " + StringFromWideString(newsgroup) + " : " + response.message ); } @@ -145,6 +150,86 @@ NntpHeader UsenetClient::Head(std::uint64_t articleID) return result; } +bool UsenetClient::IsError(const NntpMessage& msg) const +{ + if (msg.code >= 400) return true; + return false; +} + +std::unique_ptr> UsenetClient::List() +{ + Write(L"LIST COUNTS\r\n"); + /* In response, we should get a 215 response followed by the list of news + groups ending in a period on it's own line. */ + const auto response = ReadLine(); + if (IsError(response)) + { + throw UsenetClientException( + response.code, + "Failed to fetch newsgroup list from server, " + + std::string{"server responded with: "} + + response.message + ); + } + const auto listStr = ReadUntil("\r\n.\r\n"); + // parse the list. + auto lines = StringSplit(listStr, std::string{"\r\n"}); + auto result = std::make_unique>(); + if (lines.empty()) return result; + for (const auto& line: lines) + { + NntpListEntry entry; + const auto fields = StringSplit(line, std::string{" "}); + if (fields.size() == 5) + { + entry.name = fields[0]; + entry.high = std::stoul(fields[1]); + entry.low = std::stoul(fields[2]); + entry.count = std::stoul(fields[3]); + entry.status = fields[4]; + entry.id = 0; // incremented by db when saving. + entry.lastIndexedArticle = 0; + if (m_app.GetFilter().ProcessNewsgroup(entry.name)) + { + result->emplace_back(entry); + } + } + } + return result; +} + +std::unique_ptr> UsenetClient::ListGroup( + const std::wstring& newsGroup) +{ + auto result = std::make_unique>(); + if (!m_app.GetFilter().ProcessNewsgroup(StringFromWideString(newsGroup))) + { + return result; + } + Write(L"LISTGROUP " + newsGroup + L"\r\n"); + /* In response, we should get a 211 response followed by the list of + article ID's ending in a period on it's own line. */ + const auto response = ReadLine(); + if (IsError(response)) + { + throw UsenetClientException( + response.code, + "Failed to fetch newsgroup list from server, " + + std::string{"server responded with: "} + + response.message + ); + } + const auto listStr = ReadUntil("\r\n.\r\n"); + // parse the list. + auto lines = StringSplit(listStr, std::string{"\r\n"}); + if (lines.empty()) return result; + for (const auto& line: lines) + { + result->emplace_back(stoul(StringTrim(line))); + } + return result; +} + void UsenetClient::ProcessHeaders( std::uint64_t startMessage, std::function)> processFn, @@ -201,77 +286,6 @@ void UsenetClient::ProcessHeaders( } } -bool UsenetClient::IsError(const NntpMessage& msg) const -{ - if (msg.code >= 400) return true; - return false; -} - -std::unique_ptr> UsenetClient::List() -{ - Write(L"LIST COUNTS\r\n"); - /* In response, we should get a 215 response followed by the list of news - groups ending in a period on it's own line. */ - const auto response = ReadLine(); - if (IsError(response)) - { - throw UsenetClientException( - response.code, - "Failed to fetch newsgroup list from server, " - + std::string{"server responded with: "} - + response.message - ); - } - const auto listStr = ReadUntil("\r\n.\r\n"); - // parse the list. - auto lines = StringSplit(listStr, std::string{"\r\n"}); - auto result = std::make_unique>(); - if (lines.empty()) return result; - for (const auto& line: lines) - { - NntpListEntry entry; - const auto fields = StringSplit(line, std::string{" "}); - if (fields.size() == 5) - { - entry.name = fields[0]; - entry.high = std::stoul(fields[1]); - entry.low = std::stoul(fields[2]); - entry.count = std::stoul(fields[3]); - entry.status = fields[4]; - result->emplace_back(entry); - } - } - return result; -} - -std::unique_ptr> UsenetClient::ListGroup( - const std::wstring& newsGroup) -{ - Write(L"LISTGROUP " + newsGroup + L"\r\n"); - /* In response, we should get a 211 response followed by the list of - article ID's ending in a period on it's own line. */ - const auto response = ReadLine(); - if (IsError(response)) - { - throw UsenetClientException( - response.code, - "Failed to fetch newsgroup list from server, " - + std::string{"server responded with: "} - + response.message - ); - } - const auto listStr = ReadUntil("\r\n.\r\n"); - // parse the list. - auto lines = StringSplit(listStr, std::string{"\r\n"}); - auto result = std::make_unique>(); - if (lines.empty()) return result; - for (const auto& line: lines) - { - result->emplace_back(stoul(StringTrim(line))); - } - return result; -} - NntpMessage UsenetClient::ReadLine() { NntpMessage result{}; @@ -306,7 +320,7 @@ std::string UsenetClient::ReadUntil(const std::string& deliminator) void UsenetClient::Write(const std::wstring& message) { - const std::string toSend = m_conv.to_bytes(message); + const std::string toSend = StringFromWideString(message); if (m_useSSL) { m_ssl->Write(toSend); diff --git a/src/dbdump.cpp b/src/dbdump.cpp new file mode 100644 index 0000000..f82565b --- /dev/null +++ b/src/dbdump.cpp @@ -0,0 +1,82 @@ +/* + Copyright© 2021 John Sennesael + + UsenetSearch is Free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + UsenetSearch is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with UsenetSearch. If not, see . +*/ + +#include +#include + +#include "usenetsearch/Application.h" +#include "usenetsearch/StringUtils.h" + +using namespace usenetsearch; + +int main(int argc, char* argv[]) +{ + Application app; + std::string tokenFile{""}; + std::string newsgroupFile{""}; + app.AddFileOption( + 't', + "token db file to dump.", + [&tokenFile](const std::string& val) + { + tokenFile = val; + } + ); + app.AddFileOption( + 'n', + "newsgroup file to dump.", + [&newsgroupFile](const std::string& val) + { + newsgroupFile = val; + } + ); + if (!app.Init(argc, argv)) return 1; + if (!tokenFile.empty()) + { + app.GetDb().ParseTokenFile(tokenFile, [](const ArticleEntry& token){ + std::cout << "Hash: " << HashBytesToString(token.hash) << " | " + << "NewsgroupID: " << token.newsgroupID << " | " + << "ArticleID: " << token.articleID << std::endl; + }); + } + if (!newsgroupFile.empty()) + { + const auto groups = app.GetDb().LoadNewsgroupList(); + for(const auto& group: *groups) + { + std::cout << std::left + << std::setw(9) << "Id: " + std::to_string(group.id) + << std::setw(3) << " | " + << std::setw(27) << "LastIndexedMsgId: " + + std::to_string(group.lastIndexedArticle) + << std::setw(3) << " | " + << std::setw(14) << "Count: " + + std::to_string(group.count) + << std::setw(3) << " | " + << std::setw(13) << "High: " + std::to_string(group.high) + << std::setw(3) << " | " + << std::setw(8) << "Low: " + std::to_string(group.low) + << std::setw(3) << " | " + << std::setw(9) << "Status: " + group.status + << std::setw(3) << " | " + << std::setw(group.name.size() + 5) + << "Name: " + group.name + << std::endl; + } + } + return 0; +} diff --git a/src/tokendump.cpp b/src/tokendump.cpp deleted file mode 100644 index 84afd25..0000000 --- a/src/tokendump.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/* - Copyright© 2021 John Sennesael - - UsenetSearch is Free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - UsenetSearch is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with UsenetSearch. If not, see . -*/ - -#include -#include - -#include "usenetsearch/Application.h" -#include "usenetsearch/StringUtils.h" - -using namespace usenetsearch; - -int main(int argc, char* argv[]) -{ - Application app; - std::string dbFile{""}; - app.AddFileOption( - 'd', - "token db file to dump.", - [&dbFile](const std::string& val) - { - dbFile = val; - } - ); - app.Init(argc, argv); - app.GetDb().ParseTokenFile(dbFile, [](const ArticleEntry& token){ - std::cout << "Hash: " << HashBytesToString(token.hash) << " | " - << "NewsgroupID: " << token.newsgroupID << " | " - << "ArticleID: " << token.articleID << std::endl; - }); - return 0; -} diff --git a/src/usenetfind.cpp b/src/usenetfind.cpp index 7f4b8ec..c642bc8 100644 --- a/src/usenetfind.cpp +++ b/src/usenetfind.cpp @@ -1,6 +1,8 @@ #include #include "usenetsearch/Application.h" +#include "usenetsearch/UsenetClient.h" +#include "usenetsearch/Indexer.h" using namespace usenetsearch; @@ -13,15 +15,41 @@ int main(int argc, char* argv[]) searchString = s; } ); - app.Init(argc, argv); + int maxResults{0}; + app.AddIntegerOption('n', "Maximum results", + [&maxResults](int n){ + maxResults = n; + } + ); + if (!app.Init(argc, argv)) return 1; if (searchString.empty()) { std::cerr << "Missing search string." << std::endl; app.Usage(argv[0]); return 1; } - searchString = app.GetFilter().ProcessSearchString(searchString); - - auto searchResults = app.GetDb().Search(searchString); + UsenetClient client(app); + Indexer idx(app, client); + std::unique_ptr results = idx.Search( + searchString + ); + if (!results) + { + std::cout << "Nothing found." << std::endl; + return 0; + } + size_t resultCounter{0}; + for (const auto& sr: *results) + { + std::cout << std::left + << std::setw(18) << "Newsgroup id: " + std::to_string(sr.NewsgroupId()) + << std::setw(4) << " | " + << std::setw(17) << "Article id: " + std::to_string(sr.ArticleId()) + << std::setw(4) << " | " + << std::setw(10) << "Hits: " + std::to_string(sr.Hits()) + << std::endl; + resultCounter++; + if ((maxResults > 0) && (resultCounter >= maxResults)) break; + } return 0; } diff --git a/src/usenetindexd.cpp b/src/usenetindexd.cpp index 08f176d..4233435 100644 --- a/src/usenetindexd.cpp +++ b/src/usenetindexd.cpp @@ -27,53 +27,24 @@ using namespace usenetsearch; int main(int argc, char* argv[]) { Application app; - app.Init(argc, argv); - - UsenetClient client; + if (!app.Init(argc, argv)) return 1; + UsenetClient client(app); Indexer indexer(app, client); + std::cout << "Connecting to newsgroup server..."; indexer.Connect(); - + std::cout << "" << std::endl; try { - // BEGIN TEMPORARY TEST CODE - std::wstring_convert> conv; - std::unique_ptr> list; - NntpListEntry e{}; - e.count = 100; - -// 1001 headers -// e.name = "comp.os.os2.comm"; - -// 2541 headers -// e.name = "borland.public.cppbuilder.commandlinetools"; - -// 100026 headers (1859952 K) (1816.35 M) -// e.name = "dk.videnskab"; -// 1000437 headers - e.name = "alt.bible"; - -// a million or so, but this one is very slow because all subjects look the -// same, so everything goes to the same token index, which means we're -// constantly waiting on a file lock. -// e.name = "usenetserver.test"; - - list = std::make_unique>(); - list->emplace_back(e); - if ((list == nullptr) || (list->empty())) - { - std::cout << "Getting newsgroup list..."; - std::cout.flush(); - list = client.List(); - app.GetDb().UpdateNewsgroupList(*list); - std::cout << "DONE." << std::endl; - std::cout.flush(); - } - std::cout << "Number of newsgroups in newsgroup: " - << list->size() << std::endl; + std::cout << "Getting newsgroup list..."; + std::cout.flush(); + auto list = client.List(); + app.GetDb().UpdateNewsgroupList(*list); + std::cout << "" << std::endl; + std::cout.flush(); + std::cout << "Found " << list->size() << " newsgroups." << std::endl; std::cout.flush(); - // END TEMPORARY TEST CODE indexer.Index(*list); } catch (const UsenetSearchException& e) diff --git a/usenetsearch.example.conf b/usenetsearch.example.conf index 9ba0b00..4130122 100644 --- a/usenetsearch.example.conf +++ b/usenetsearch.example.conf @@ -38,8 +38,30 @@ max_tree_depth: 10 # The higher your tree max_tree_depth, the more likely you'll need to increase # this. -max_threads: 8 -batch_size: 1000 +max_threads: 16 +batch_size: 10000 + +############################# +# Newsgroup filter settings # +############################# + +# List one or more newsgroup regular expressions to include or exclude from +# being indexed. Blacklisted patterns take precedence over whitelisted patterns. +# These options may be repeated to include additional blacklist/whitelist +# regular expressions. + +# If filter_newsgroup_whitelist is set, only newsgroups matching the configured +# regular expressions will be included in indexing. +# If not set, all of usenet will be indexed (with the exeption of +# filter_newsgroup_blacklist groups) + +filter_newsgroup_whitelist: ^alt\.bible$ +filter_newsgroup_whitelist: ^borland\.public\.cppbuilder\.* + +# filter_newsgroup_blacklist allows you to exclude newsgroups from being +# indexed, whether filter_newsgroup_whitlelist is set or not. + +filter_newsgroup_blacklist: .*binaries.* ######################## # Word filter settings # @@ -55,21 +77,21 @@ batch_size: 1000 # List of strings is comma-separated and case-insensitive. Each subsequent # option appends to the previously defined list. -filter_erase_subtoken: a,about,actually,almost,also,although,always,am,an,and -filter_erase_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do -filter_erase_subtoken: does,each,either,else,for,from,had,has,have,hence,how -filter_erase_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my -filter_erase_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where -filter_erase_subtoken: whereas,wherever,whenever,whether,which,while,who,whom -filter_erase_subtoken: whoever,whose,why,will,with,within,without,would,yes -filter_erase_subtoken: yet,you,your +# filter_erase_subtoken: the,by # This setting lets you list all tokens that will only be indexed on direct # (whole string) matches. Each token is comma-separated, and the configuration # option may be listed multiple times as well, each subsequent option appends to # the previously defined list. All tokens are case-insensitive. -filter_no_subtoken: makes for,funny business +filter_no_subtoken: a,about,actually,almost,also,although,always,am,an,and +filter_no_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do +filter_no_subtoken: does,each,either,else,for,from,had,has,have,hence,how +filter_no_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my +filter_no_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where +filter_no_subtoken: whereas,wherever,whenever,whether,which,while,who,whom +filter_no_subtoken: whoever,whose,why,will,with,within,without,would,yes +filter_no_subtoken: yet,you,your # Sets the minimum number of words in a sub-token. You may use this if you don't # want to index single-words unless they are a direct match to the subject (in