From bb9c3da3d88c8469b74bafc218cdc91b0ec6c3d9 Mon Sep 17 00:00:00 2001
From: John Sennesael <john@adminking.com>
Date: Mon, 18 Oct 2021 20:19:11 -0500
Subject: [PATCH] Implemented newsgroup filtering, work on resuming indexing
 where last left off (still buggy)

---
 CMakeLists.txt                       |  12 +-
 include/usenetsearch/Application.h   |  13 +-
 include/usenetsearch/Configuration.h |   9 +-
 include/usenetsearch/Database.h      |  30 ++---
 include/usenetsearch/Filter.h        |   6 +-
 include/usenetsearch/Indexer.h       |  35 +++++
 include/usenetsearch/StringUtils.h   |   4 +
 include/usenetsearch/UsenetClient.h  |  20 ++-
 src/Application.cpp                  |  49 ++++++-
 src/Configuration.cpp                |  54 +++++++-
 src/Database.cpp                     | 187 +++++++++++++++++++++------
 src/Filter.cpp                       |  44 ++++---
 src/Indexer.cpp                      | 152 +++++++++++++++++++++-
 src/Serialize.cpp                    |  30 ++++-
 src/StringUtils.cpp                  |  42 +++++-
 src/UsenetClient.cpp                 | 160 ++++++++++++-----------
 src/dbdump.cpp                       |  82 ++++++++++++
 src/tokendump.cpp                    |  45 -------
 src/usenetfind.cpp                   |  36 +++++-
 src/usenetindexd.cpp                 |  51 ++------
 usenetsearch.example.conf            |  44 +++++--
 21 files changed, 809 insertions(+), 296 deletions(-)
 create mode 100644 src/dbdump.cpp
 delete mode 100644 src/tokendump.cpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 68b4b1f..a40493d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,6 +41,7 @@ add_library(usenetsearch
     "src/Dns.cpp"
     "src/Except.cpp"
     "src/Filter.cpp"
+    "src/Indexer.cpp"
     "src/IoSocket.cpp"
     "src/Serialize.cpp"
     "src/SSLConnection.cpp"
@@ -64,7 +65,6 @@ target_link_libraries(usenetsearch
 # Indexer executable -----------------------------------------------------------
 
 add_executable(usenetindexd
-    "src/Indexer.cpp"
     "src/usenetindexd.cpp"
 )
 
@@ -94,18 +94,18 @@ target_include_directories(usenetfind
         include
 )
 
-# tokendump executable ---------------------------------------------------------
+# dbdump executable ------------------------------------------------------------
 
-add_executable(tokendump
-    "src/tokendump.cpp"
+add_executable(dbdump
+    "src/dbdump.cpp"
 )
 
-target_link_libraries(tokendump
+target_link_libraries(dbdump
     PUBLIC ${OPENSSL_LIBRARIES} stdc++fs
     PRIVATE usenetsearch
 )
 
-target_include_directories(tokendump
+target_include_directories(dbdump
     PRIVATE
         include
 )
diff --git a/include/usenetsearch/Application.h b/include/usenetsearch/Application.h
index a6801a8..ecf4501 100644
--- a/include/usenetsearch/Application.h
+++ b/include/usenetsearch/Application.h
@@ -29,7 +29,7 @@ namespace usenetsearch {
 
 enum class CommandLineOptionType
 {
-    Boolean, Path, String
+    Boolean, Integer, Path, String
 };
 
 struct CommandLineOption
@@ -78,16 +78,25 @@ public:
         std::function<void(std::filesystem::path)> onParse,
         std::filesystem::path defaultValue = "."
     );
+
+    void AddIntegerOption(
+        char option,
+        const std::string& help,
+        std::function<void(int)> onParse,
+        int defaultValue = 0
+    );
+
     void AddStringOption(
         char option,
         const std::string& help,
         std::function<void(std::string)> onParse,
         std::string defaultValue = ""
     );
+    bool CanRun() const;
     Configuration& GetConfig();
     Database& GetDb();
     Filter& GetFilter();
-    void Init(int argc, char* argv[]);
+    bool Init(int argc, char* argv[]);
     void Usage(const std::string& programName);
 };
 
diff --git a/include/usenetsearch/Configuration.h b/include/usenetsearch/Configuration.h
index ea0dac2..be6e3a6 100644
--- a/include/usenetsearch/Configuration.h
+++ b/include/usenetsearch/Configuration.h
@@ -18,6 +18,7 @@
 #pragma once
 
 #include <filesystem>
+#include <regex>
 #include <string>
 
 #include "usenetsearch/Except.h"
@@ -36,6 +37,8 @@ class Configuration
 {
     std::uint16_t m_batchSize{1};
     std::vector<std::string> m_filterEraseSubtoken;
+    std::vector<std::regex> m_filterNewsgroupBlacklist;
+    std::vector<std::regex> m_filterNewsgroupWhitelist;
     std::vector<std::string> m_filterWordsNoSubtoken;
     std::uint16_t m_maxThreads{1};
     std::uint8_t m_maxTreeDepth{5};
@@ -51,8 +54,10 @@ public:
 
     std::uint16_t BatchSize() const;
     std::filesystem::path DatabasePath() const;
-    std::vector<std::string> FilterEraseSubtoken() const;
-    std::vector<std::string> FilterWordsNoSubtoken() const;
+    std::vector<std::string>& FilterEraseSubtoken();
+    std::vector<std::regex>& FilterNewsgroupBlacklist();
+    std::vector<std::regex>& FilterNewsgroupWhitelist();
+    std::vector<std::string>& FilterWordsNoSubtoken();
     std::uint16_t MaxThreads() const;
     std::uint8_t MaxTreeDepth() const;
     std::uint16_t MinSubtokenWords() const;
diff --git a/include/usenetsearch/Database.h b/include/usenetsearch/Database.h
index 383d483..e14b312 100644
--- a/include/usenetsearch/Database.h
+++ b/include/usenetsearch/Database.h
@@ -32,6 +32,7 @@
 
 namespace usenetsearch {
 
+class Application;
 static constexpr const std::uint64_t DatabaseVersion{1};
 
 struct ArticleEntry
@@ -55,27 +56,21 @@ struct DatabaseException: public UsenetSearchException
 class Database
 {
 
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
+    Application& m_app;
     std::filesystem::path m_databasePath;
     std::uint64_t m_databaseVersion{DatabaseVersion};
-    Filter& m_filter;
     std::vector<std::filesystem::path> m_lockedFiles;
     std::mutex m_lockedFilesMutex;
     std::uint8_t m_maxTreeDepth{5};
     SerializableFile m_newsGroupFileIO;
 
-    bool GetArticleEntry(
-        const std::string& subToken,
-        const std::string& searchString,
-        ArticleEntry& entry,
-        size_t& startPosition,
-        size_t& endPosition,
-        size_t& count);
-
     std::filesystem::path GetTokenFilePath(
         const std::string& token,
         bool mkdirs=false
     );
+    std::uint64_t GetUniqueNntpEntryId(
+        const std::vector<NntpListEntry>& list
+    ) const;
     bool HasToken(
         const std::string& subtoken,
         std::uint64_t newsgroupID,
@@ -94,11 +89,10 @@ class Database
 
 public:
 
-    explicit Database(Filter& filter);
+    explicit Database(Application& app);
     ~Database();
-    std::unique_ptr<std::vector<NntpHeader>> LoadArticleList(
-        const std::wstring& newsgroup
-    );
+    std::unique_ptr<NntpListEntry> FindNntpEntry(const std::string& subject);
+    std::uint32_t GetLastIndexedArticle(std::uint64_t newsgroupID);
     std::unique_ptr<std::vector<NntpListEntry>> LoadNewsgroupList();
     void MaxTreeDepth(std::uint8_t depth);
     void Open(std::filesystem::path dbPath);
@@ -114,11 +108,11 @@ public:
     std::unique_ptr<std::vector<ArticleEntry>> Search(
         const std::string& searchString
     );
-    void UpdateArticleList(
-        const std::wstring& newsgroup,
-        const std::vector<NntpHeader>& headers
+    void SetLastIndexedArticle(
+        std::uint64_t newsgroupID,
+        std::int32_t articleID
     );
-    void UpdateNewsgroupList(const std::vector<NntpListEntry>& list);
+    void UpdateNewsgroupList(std::vector<NntpListEntry>& list);
 
 };
 
diff --git a/include/usenetsearch/Filter.h b/include/usenetsearch/Filter.h
index aca4393..89d5e93 100644
--- a/include/usenetsearch/Filter.h
+++ b/include/usenetsearch/Filter.h
@@ -31,7 +31,6 @@ class Filter
 {
 
     Configuration& m_config;
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
     std::vector<std::string> m_noSubtokenWords;
     std::unordered_map<std::unique_ptr<std::wregex>, std::wstring>
         m_eraseTokenRegexes;
@@ -40,11 +39,12 @@ public:
 
     explicit Filter(Configuration& config);
     void Init();
-    std::string ProcessSearchString(const std::string& searchString);
+    bool ProcessNewsgroup(const std::string& newsgroup) const;
+    std::string ProcessSearchString(const std::string& searchString) const;
     std::string ProcessToken(
         const std::string& token,
         const std::string& searchString
-    );
+    ) const;
 
 };
 
diff --git a/include/usenetsearch/Indexer.h b/include/usenetsearch/Indexer.h
index abb9ae1..2683e6d 100644
--- a/include/usenetsearch/Indexer.h
+++ b/include/usenetsearch/Indexer.h
@@ -18,7 +18,9 @@
 #pragma once
 
 #include <codecvt>
+#include <cstdint>
 #include <locale>
+#include <vector>
 
 #include "usenetsearch/Application.h"
 #include "usenetsearch/Filter.h"
@@ -27,6 +29,36 @@
 
 namespace usenetsearch {
 
+class SearchResult
+{
+    // One (newsgroup id, article id) pair together with a hit counter.
+    std::uint32_t m_newsgroupId{0};
+    std::uint32_t m_articleId{0};
+    size_t m_numHits{0};
+
+public:
+    // Construction: default, from an ArticleEntry, from raw ids, or by copy.
+    SearchResult() = default;
+    SearchResult(const ArticleEntry& entry);
+    SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId);
+    SearchResult(const SearchResult& other);
+    // Read access to the stored values; Inc() adds one to the hit counter.
+    std::uint32_t ArticleId() const;
+    size_t Hits() const;
+    void Inc();
+    std::uint32_t NewsgroupId() const;
+    // ==/!= look at the two ids only; <, >, >=, <= look at the hit count only.
+    void operator=(const SearchResult& other);
+    bool operator==(const SearchResult& other) const;
+    bool operator!=(const SearchResult& other) const;
+    bool operator<(const SearchResult& other) const;
+    bool operator>(const SearchResult& other) const;
+    bool operator>=(const SearchResult& other) const;
+    bool operator<=(const SearchResult& other) const;
+};
+
+typedef std::vector<SearchResult> SearchResults;
+
 class Indexer
 {
     Application& m_app;
@@ -40,6 +72,9 @@ public:
 
     void Connect();
     void Index(const std::vector<NntpListEntry>& newsgroups);
+    std::unique_ptr<SearchResults> Search(
+        const std::string& searchString
+    );
 
 };
 
diff --git a/include/usenetsearch/StringUtils.h b/include/usenetsearch/StringUtils.h
index 5dfaa76..6e5eee9 100644
--- a/include/usenetsearch/StringUtils.h
+++ b/include/usenetsearch/StringUtils.h
@@ -39,6 +39,8 @@ std::string CharToHex(const char c);
 
 std::string HashBytesToString(const std::array<std::uint8_t, 16>& input);
 
+std::string StringFromWideString(const std::wstring& input);
+
 std::string StringHash(const std::string& input);
 
 std::array<std::uint8_t, 16> StringHashBytes(const std::string& input);
@@ -137,4 +139,6 @@ void StringTreeOperation(
     std::function<void(const std::string& subToken, const std::string& str)> Fn
 );
 
+std::wstring WideStringFromString(const std::string& input);
+
 } // namespace usenetsearch
diff --git a/include/usenetsearch/UsenetClient.h b/include/usenetsearch/UsenetClient.h
index b55a182..b2aa0d4 100644
--- a/include/usenetsearch/UsenetClient.h
+++ b/include/usenetsearch/UsenetClient.h
@@ -31,6 +31,8 @@
 
 namespace usenetsearch {
 
+class Application;
+
 struct UsenetClientException: public UsenetSearchException
 {
     UsenetClientException(int errorCode, const std::string& message):
@@ -59,16 +61,19 @@ struct NntpMessage
 
 struct NntpListEntry
 {
-    std::string name;
+    std::uint64_t id;
+    std::uint32_t lastIndexedArticle;
+    std::uint64_t count;
     std::uint64_t high;
     std::uint64_t low;
-    std::uint64_t count;
+    std::string name;
     std::string status;
 };
 
 class UsenetClient
 {
-    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
+
+    Application& m_app;
     std::unique_ptr<SSLConnection> m_ssl;
     std::unique_ptr<TcpConnection> m_tcp;
     bool m_useSSL{false};
@@ -80,14 +85,7 @@ class UsenetClient
 
 public:
 
-    /* Expected flow:
-        * Connect
-        * Authenticate
-        * List() to get a list of newsgroups
-        * for every newsgroup:
-          * XZHDR subject 0-
-          * uncompress result.
-    */
+    UsenetClient(Application& app);
 
     void Authenticate(const std::wstring& user, const std::wstring& password);
 
diff --git a/src/Application.cpp b/src/Application.cpp
index c853bc8..7c9f538 100644
--- a/src/Application.cpp
+++ b/src/Application.cpp
@@ -41,6 +41,21 @@ void Application::AddBooleanOption(
     m_commandLineArguments.emplace_back(std::move(val));
 }
 
+// Registers integer option -<option>. ExecuteCustomOption converts the
+// argument with std::stoi and hands the resulting int to onParse.
+void Application::AddIntegerOption(char option, const std::string& help,
+    std::function<void(int)> onParse, int defaultValue)
+{
+    auto integerOption = std::make_shared<CommandLineOptionValue<int>>();
+    integerOption->option = option;
+    integerOption->helpText = help;
+    integerOption->type = CommandLineOptionType::Integer;
+    integerOption->onParse = onParse;
+    integerOption->value = defaultValue;
+    // The list of registered command line arguments takes over the pointer.
+    m_commandLineArguments.emplace_back(std::move(integerOption));
+}
+
 void Application::AddFileOption(
     char option,
     const std::string& help,
@@ -91,6 +106,9 @@ void Application::Usage(const std::string& programName)
             case CommandLineOptionType::Boolean:
                 std::cout << "[-" << optionValue->option << "] ";
                 break;
+            case CommandLineOptionType::Integer:
+                std::cout << "[-" << optionValue->option << " <number>] ";
+                break;
             case CommandLineOptionType::Path:
                 std::cout << "[-" << optionValue->option << " <path>] ";
                 break;
@@ -112,6 +130,10 @@ void Application::Usage(const std::string& programName)
                 std::cout << "-" << optionValue->option << "\t"
                           << optionValue->helpText << std::endl;
                 break;
+            case CommandLineOptionType::Integer:
+                std::cout << "-" << optionValue->option << " <number>\t"
+                          << optionValue->helpText << std::endl;
+                break;
             case CommandLineOptionType::Path:
                 std::cout << "-" << optionValue->option << " <path>\t"
                           << optionValue->helpText << std::endl;
@@ -125,11 +147,16 @@ void Application::Usage(const std::string& programName)
     std::cout << std::endl;
 }
 
-Application::Application() : m_db(m_filter), m_filter(m_config)
+Application::Application() : m_db(*this), m_filter(m_config)
 {
     std::cout.setf(std::ios::unitbuf);
 }
 
+bool Application::CanRun() const
+{
+    return m_canRun; // Init() returns false right after ParseArgs if unset
+}
+
 Configuration& Application::GetConfig()
 {
     return m_config;
@@ -161,6 +188,21 @@ void Application::ExecuteCustomOption(
                 castedOption->onParse(true);
             }
             break;
+        case CommandLineOptionType::Integer:
+            {
+                std::shared_ptr<CommandLineOptionValue<int>> castedOption =
+                    std::dynamic_pointer_cast<CommandLineOptionValue<int>>(
+                        opt
+                    );
+                if (castedOption == nullptr)
+                {
+                    throw std::runtime_error(
+                        "Could not cast cli arg to the correct type."
+                    );
+                }
+                castedOption->onParse(std::stoi(value));
+            }
+            break;
         case CommandLineOptionType::String:
             {
                 std::shared_ptr<CommandLineOptionValue<std::string>>
@@ -203,15 +245,16 @@ Filter& Application::GetFilter()
     return m_filter;
 }
 
-void Application::Init(int argc, char* argv[])
+bool Application::Init(int argc, char* argv[])
 {
     ParseArgs(argc, argv);
-    if (!m_canRun) return;
+    if (!m_canRun) return false;
     // Read config, setup db
     m_config.Open(m_configFile);
     m_db.MaxTreeDepth(m_config.MaxTreeDepth());
     m_db.Open(m_config.DatabasePath());
     m_filter.Init();
+    return true;
 }
 
 void Application::ParseArgs(int argc, char* argv[])
diff --git a/src/Configuration.cpp b/src/Configuration.cpp
index cc21231..65a90ba 100644
--- a/src/Configuration.cpp
+++ b/src/Configuration.cpp
@@ -17,8 +17,10 @@
 
 #include <filesystem>
 #include <fstream>
+#include <regex>
 #include <string>
 
+#include "usenetsearch/ScopeExit.h"
 #include "usenetsearch/StringUtils.h"
 
 #include "usenetsearch/Configuration.h"
@@ -35,12 +37,22 @@ std::filesystem::path Configuration::DatabasePath() const
     return m_databasePath;
 }
 
-std::vector<std::string> Configuration::FilterEraseSubtoken() const
+std::vector<std::string>& Configuration::FilterEraseSubtoken()
 {
     return m_filterEraseSubtoken;
 }
 
-std::vector<std::string> Configuration::FilterWordsNoSubtoken() const
+std::vector<std::regex>& Configuration::FilterNewsgroupBlacklist()
+{
+    return m_filterNewsgroupBlacklist; // from filter_newsgroup_blacklist keys
+}
+
+std::vector<std::regex>& Configuration::FilterNewsgroupWhitelist()
+{
+    return m_filterNewsgroupWhitelist; // from filter_newsgroup_whitelist keys
+}
+
+std::vector<std::string>& Configuration::FilterWordsNoSubtoken()
 {
     return m_filterWordsNoSubtoken;
 }
@@ -95,6 +107,7 @@ void Configuration::Open(const std::string& filename)
             "Could not open configuration file: " + filename
         );
     }
+    ScopeExit finCloser([&fin](){ fin.close(); });
     int line_nr = 0;
     while(std::getline(fin,line))
     {
@@ -108,7 +121,6 @@ void Configuration::Open(const std::string& filename)
         const auto kvp = StringSplit(line, std::string{":"}, 2);
         if (kvp.size() != 2)
         {
-            fin.close();
             throw ConfigurationException(EINVAL,
                 std::string("Invalid configuration in ")
                 + filename + std::string(" line ")
@@ -137,6 +149,40 @@ void Configuration::Open(const std::string& filename)
                 }
             }
         }
+        else if (key == "filter_newsgroup_blacklist")
+        {
+            try 
+            {
+                std::regex re(value);
+                m_filterNewsgroupBlacklist.emplace_back(re);
+            }
+            catch (const std::regex_error& e)
+            {
+                throw ConfigurationException(EINVAL,
+                    std::string("Invalid configuration in ")
+                    + filename + std::string(" line ")
+                    + std::to_string(line_nr) + " : Regular expression \""
+                    + value + "\" did not parse: " + e.what()
+                );
+            }
+        }
+        else if (key == "filter_newsgroup_whitelist")
+        {
+            try 
+            {
+                std::regex re(value);
+                m_filterNewsgroupWhitelist.emplace_back(re);
+            }
+            catch (const std::regex_error& e)
+            {
+                throw ConfigurationException(EINVAL,
+                    std::string("Invalid configuration in ")
+                    + filename + std::string(" line ")
+                    + std::to_string(line_nr) + " : Regular expression \""
+                    + value + "\" did not parse: " + e.what()
+                );
+            }
+        }
         else if (key == "filter_no_subtoken")
         {
             const auto tokens = StringSplit(value, std::string{","});
@@ -195,7 +241,6 @@ void Configuration::Open(const std::string& filename)
         }
         else
         {
-            fin.close();
             throw ConfigurationException(EINVAL,
                 std::string("Invalid configuration in ")
                 + filename + std::string(" line ")
@@ -203,7 +248,6 @@ void Configuration::Open(const std::string& filename)
             );
         }
     }
-    fin.close();
 }
 
 } // namespace usenetsearch
diff --git a/src/Database.cpp b/src/Database.cpp
index 0772505..f57c41f 100644
--- a/src/Database.cpp
+++ b/src/Database.cpp
@@ -24,6 +24,7 @@
 #include <thread>
 #include <vector>
 
+#include "usenetsearch/Application.h"
 #include "usenetsearch/StringUtils.h"
 #include "usenetsearch/UsenetClient.h"
 #include "usenetsearch/ScopeExit.h"
@@ -35,7 +36,7 @@ namespace usenetsearch {
 
 // Database class --------------------------------------------------------------
 
-Database::Database(Filter& filter): m_filter(filter)
+Database::Database(Application& app): m_app(app)
 {
 }
 
@@ -44,15 +45,41 @@ Database::~Database()
     m_newsGroupFileIO.Close();
 }
 
-bool Database::GetArticleEntry(
-    const std::string& subToken,
-    const std::string& searchString,
-    ArticleEntry& entry,
-    size_t& startPosition,
-    size_t& endPosition,
-    size_t& count)
+std::unique_ptr<NntpListEntry> Database::FindNntpEntry(
+    const std::string& subject)
 {
-    return false;
+    OpenNewsGroupFile();
+    ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
+    const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64();
+    std::unique_ptr<NntpListEntry> result = nullptr;
+    for (std::uint64_t n = 0; n != numGroups; ++n)
+    {
+        NntpListEntry entry;
+        m_newsGroupFileIO >> entry;
+        if (entry.name == subject)
+        {
+            result = std::make_unique<NntpListEntry>(entry);
+            break;
+        }
+    }
+    return result;
+}
+
+std::uint32_t Database::GetLastIndexedArticle(std::uint64_t newsgroupID)
+{
+    OpenNewsGroupFile();
+    ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
+    const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64();
+    for (std::uint64_t n = 0; n != numGroups; ++n)
+    {
+        NntpListEntry entry;
+        m_newsGroupFileIO >> entry;
+        if (entry.id == newsgroupID)
+        {
+            return entry.lastIndexedArticle;
+        }
+    }
+    return 0;
 }
 
 std::filesystem::path Database::GetTokenFilePath(
@@ -77,29 +104,29 @@ std::filesystem::path Database::GetTokenFilePath(
     return groupPath / groupFile;
 }
 
+// Returns an id one greater than the largest id in list (0 for an empty list).
+std::uint64_t Database::GetUniqueNntpEntryId(
+        const std::vector<NntpListEntry>& list) const
+{
+    std::uint64_t nextId{0};
+    for (const NntpListEntry& existing: list)
+    {
+        // Ids below the current candidate cannot collide with it; skip them.
+        if (existing.id < nextId) continue;
+        nextId = existing.id + 1;
+    }
+    return nextId;
+}
+
 void Database::MaxTreeDepth(std::uint8_t depth)
 {
     m_maxTreeDepth = depth;
 }
 
-std::unique_ptr<std::vector<NntpHeader>> Database::LoadArticleList(
-    const std::wstring& newsgroup)
-{
-
-}
-
 std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
 {
     OpenNewsGroupFile();
-    const std::uint64_t dbVersion = m_newsGroupFileIO.ReadInt64();
-    if (dbVersion != m_databaseVersion)
-    {
-        throw DatabaseException(EINVAL,
-            "The loaded database version (" + std::to_string(dbVersion) 
-            + ") does not match the current database version ("
-            + std::to_string(m_databaseVersion) + ")");
-    }
-
+    ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
     const size_t newsGroupCount = m_newsGroupFileIO.ReadInt64();
 
     auto result = std::make_unique<std::vector<NntpListEntry>>();
@@ -119,18 +146,37 @@ void Database::Open(std::filesystem::path dbPath)
     {
         std::filesystem::create_directory(dbPath);
     }
-    OpenNewsGroupFile();
 }
 
 void Database::OpenNewsGroupFile()
 {
     if (m_newsGroupFileIO.IsOpen())
     {
+        m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg);
         return;
     }
     const std::filesystem::path newsGroupFilePath =
         m_databasePath / "newsgroups.db";
+    const bool exists = std::filesystem::exists(newsGroupFilePath);
     m_newsGroupFileIO.Open(newsGroupFilePath);
+    if (exists)
+    {
+        const std::uint64_t ver = m_newsGroupFileIO.ReadInt64();
+        if (ver != m_databaseVersion)
+        {
+            throw DatabaseException(EBADF,
+                std::string{"Mismatching newgroup file database version:"}
+                + " have: " + std::to_string(ver) + " - want: "
+                + std::to_string(m_databaseVersion)
+            );
+        }
+    }
+    else
+    {
+        m_newsGroupFileIO << m_databaseVersion;
+        m_newsGroupFileIO << std::uint64_t{0}; // newsgroup count.
+        m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg);
+    }
 }
 
 void Database::ParseTokenFile(
@@ -155,26 +201,30 @@ void Database::ParseTokenFile(
     }
 }
 
-void Database::UpdateArticleList(
-    const std::wstring& newsgroup,
-    const std::vector<NntpHeader>& headers)
+void Database::SetLastIndexedArticle(
+    std::uint64_t newsgroupID,
+    std::int32_t articleID)
 {
-
-}
-
-void Database::UpdateNewsgroupList(const std::vector<NntpListEntry>& list)
-{
-    OpenNewsGroupFile();
-
-    m_newsGroupFileIO << m_databaseVersion;
-
-    const std::uint64_t newsGroupCount = list.size();
-    m_newsGroupFileIO << newsGroupCount;
-
-    for (const auto& entry: list)
+    auto outItems = LoadNewsgroupList();
+    bool found{false};
+    if (outItems)
     {
-        m_newsGroupFileIO << entry;
+        for (auto& entry: *outItems)
+        {
+            if (entry.id == newsgroupID)
+            {
+                entry.lastIndexedArticle = articleID;
+                found = true;
+            }
+        }
     }
+    if (!found)
+    {
+        throw DatabaseException(EINVAL,
+            "Attempt to update newsgroup not found in database - id: "
+            + std::to_string(newsgroupID));
+    }
+    UpdateNewsgroupList(*outItems);
 }
 
 void Database::SaveSearchTokens(
@@ -188,7 +238,10 @@ void Database::SaveSearchTokens(
         " ",
         m_maxTreeDepth,
         [&](const std::string& subToken, const std::string& str){
-            const std::string tok = m_filter.ProcessToken(subToken, str);
+            const std::string tok = m_app.GetFilter().ProcessToken(
+                subToken,
+                str
+            );
             if (tok.empty()) return;
             SaveToken(tok, newsgroupID, articleID);
         }
@@ -311,4 +364,52 @@ std::unique_ptr<std::vector<ArticleEntry>> Database::Search(
     return result;
 }
 
+// Merges list into the stored newsgroup list and rewrites the newsgroup file.
+// Entries are matched by name. A match keeps its stored id and name and has
+// count, high, low, status and lastIndexedArticle overwritten from list; any
+// other entry is appended under a freshly generated id. Either way the id in
+// use is written back into list, which is why the parameter is not const.
+void Database::UpdateNewsgroupList(std::vector<NntpListEntry>& list)
+{
+    if (list.empty()) return;
+
+    auto outList = LoadNewsgroupList();
+    if (!outList)
+    {
+        // Never dereference a null list below; start from an empty one.
+        outList = std::make_unique<std::vector<NntpListEntry>>();
+    }
+    for (auto& entry: list)
+    {
+        const auto existing = std::find_if(
+            outList->begin(),
+            outList->end(),
+            [&entry](const NntpListEntry& oldEntry)
+            {
+                return oldEntry.name == entry.name;
+            }
+        );
+        if (existing != outList->end())
+        {
+            // NOTE(review): overwrites the resume position too - confirm.
+            existing->count = entry.count;
+            existing->high = entry.high;
+            existing->lastIndexedArticle = entry.lastIndexedArticle;
+            existing->low = entry.low;
+            existing->status = entry.status;
+            entry.id = existing->id;
+            continue;
+        }
+        // Add new.
+        entry.id = GetUniqueNntpEntryId(*outList);
+        outList->emplace_back(entry);
+    }
+
+    // OpenNewsGroupFile positions the file just past the version field.
+    OpenNewsGroupFile();
+    ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
+    m_newsGroupFileIO << std::uint64_t{outList->size()};
+    for (const NntpListEntry& stored: *outList) m_newsGroupFileIO << stored;
+}
+
 } // namespace usenetsearch
diff --git a/src/Filter.cpp b/src/Filter.cpp
index 9f57dbb..5b1317b 100644
--- a/src/Filter.cpp
+++ b/src/Filter.cpp
@@ -36,7 +36,7 @@ void Filter::Init()
     // Pre-compile regexes for all the subtokens that should be erased.
     std::for_each(eraseTokens.begin(), eraseTokens.end(),
         [&](const std::string& tok){
-            const std::wstring wtok = m_conv.from_bytes(tok);
+            const std::wstring wtok = WideStringFromString(tok);
             m_eraseTokenRegexes.emplace(
                 std::make_unique<std::wregex>(L"^" + wtok + L"\\s+"),
                 std::wstring{}
@@ -53,17 +53,34 @@ void Filter::Init()
     );
 }
 
-std::string Filter::ProcessSearchString(const std::string& searchString)
+bool Filter::ProcessNewsgroup(const std::string& newsgroup) const
 {
-    std::wstring str;
-    try 
+    for (const auto& blackRe: m_config.FilterNewsgroupBlacklist())
     {
-        str = m_conv.from_bytes(searchString);
+        std::smatch matches;
+        if (std::regex_match(newsgroup, matches, blackRe))
+        {
+            if (matches.size() > 0) return false;
+        }
     }
-    catch (const std::range_error&)
+    if (m_config.FilterNewsgroupWhitelist().size() > 0)
     {
-        return ""; // string is not valid utf8
+        for (const auto& whiteRe: m_config.FilterNewsgroupWhitelist())
+        {
+            std::smatch matches;
+            if (std::regex_match(newsgroup, matches, whiteRe))
+            {
+                if (matches.size() > 0) return true;
+            }
+        }
+        return false;
     }
+    return true;
+}
+
+std::string Filter::ProcessSearchString(const std::string& searchString) const
+{
+    std::wstring str = WideStringFromString(searchString);
     std::remove_if(str.begin(), str.end(), [](wchar_t c){
         // Remove control characters.
         if (c < 0x20) return true; // ascii control chars
@@ -89,21 +106,12 @@ std::string Filter::ProcessSearchString(const std::string& searchString)
     // Convert strings that are ONLY whitespace to blank strings.
     static std::wregex rxAllWhitespace(L"^\\s+$");
     str = std::regex_replace(str, rxAllWhitespace, L"");
-    std::string result;
-    try 
-    {
-        result = m_conv.to_bytes(str); 
-    }
-    catch (const std::range_error&)
-    {
-        return "";
-    }
-    return result;
+    return StringFromWideString(str); 
 }
 
 std::string Filter::ProcessToken(
     const std::string& token,
-    const std::string& searchString)
+    const std::string& searchString) const
 {
     std::string result = token;
     // Process the nosubtokens list.
diff --git a/src/Indexer.cpp b/src/Indexer.cpp
index 3f390f7..dcfbb07 100644
--- a/src/Indexer.cpp
+++ b/src/Indexer.cpp
@@ -23,6 +23,92 @@
 
 namespace usenetsearch {
 
+// SearchResult class ----------------------------------------------------------
+
+// Takes both ids from entry; the hit count keeps its in-class default of 0.
+SearchResult::SearchResult(const ArticleEntry& entry):
+    m_newsgroupId(entry.newsgroupID), m_articleId(entry.articleID)
+{
+}
+
+// Stores the given ids; the hit count starts at its in-class default of 0.
+SearchResult::SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId)
+    : m_newsgroupId{newsgroupId}, m_articleId{articleId}
+{
+}
+
+// Copy constructor: duplicates both ids and the accumulated hit count.
+SearchResult::SearchResult(const SearchResult& other):
+    m_newsgroupId(other.m_newsgroupId), m_articleId(other.m_articleId),
+    m_numHits(other.m_numHits)
+{
+}
+
+std::uint32_t SearchResult::ArticleId() const
+{
+    return m_articleId; // set by a constructor or by operator=
+}
+
+size_t SearchResult::Hits() const
+{
+    return m_numHits; // starts at 0 and is raised by Inc()
+}
+
+void SearchResult::Inc()
+{
+    ++m_numHits; // one more hit counted for this result
+}
+
+std::uint32_t SearchResult::NewsgroupId() const
+{
+    return m_newsgroupId; // set by a constructor or by operator=
+}
+
+void SearchResult::operator=(const SearchResult& other)
+{
+    m_newsgroupId = other.m_newsgroupId;
+    m_articleId = other.m_articleId;
+    m_numHits = other.m_numHits; // the hit count is copied along with the ids
+}
+
+// Two results are equal when both the article id and the newsgroup id
+// match; the hit count plays no part in the comparison.
+bool SearchResult::operator==(const SearchResult& other) const
+{
+    if (m_articleId != other.m_articleId) return false;
+    return m_newsgroupId == other.m_newsgroupId;
+}
+
+// Results differ when either id differs; the hit count is ignored, exactly
+// as in operator==.
+bool SearchResult::operator!=(const SearchResult& other) const
+{
+    const bool sameArticle = (m_articleId == other.m_articleId);
+    return !(sameArticle && (m_newsgroupId == other.m_newsgroupId));
+}
+
+bool SearchResult::operator<(const SearchResult& other) const
+{
+    return other.m_numHits > m_numHits; // ordered by hit count alone
+}
+
+bool SearchResult::operator>(const SearchResult& other) const
+{
+    return other.m_numHits < m_numHits; // ordered by hit count alone
+}
+
+bool SearchResult::operator>=(const SearchResult& other) const
+{
+    return !(m_numHits < other.m_numHits); // ordered by hit count alone
+}
+
+bool SearchResult::operator<=(const SearchResult& other) const
+{
+    return !(m_numHits > other.m_numHits); // ordered by hit count alone
+}
+
+// Indexer class ---------------------------------------------------------------
+
 Indexer::Indexer(Application& app, UsenetClient& client)
     : m_app(app), m_client(client)
 {
@@ -59,20 +145,41 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
                     << "(.=" << batchSize << " headers)." << std::endl;
         std::cout.flush();
         std::atomic<std::uint64_t> headerCount{0};
+        const std::atomic<std::uint64_t> groupID = group.id;
         std::reference_wrapper<Database> dbref = std::ref(m_app.GetDb());
-        m_client.ProcessHeaders(0,
-            [this, &headerCount, &dbref](std::shared_ptr<NntpHeaders> headers){
-                m_threads.Queue([this, headers, &headerCount, &dbref](){
+        const std::uint32_t startMessage = dbref.get().GetLastIndexedArticle(
+            groupID
+        );
+        std::cout << "Indexing starting at message: " 
+                  << std::to_string(startMessage) << std::endl;
+        m_client.ProcessHeaders(startMessage,
+            [this, &headerCount, &dbref, &groupID](std::shared_ptr<NntpHeaders> headers){
+                m_threads.Queue([this, headers, &headerCount, &dbref, &groupID](){
+                    std::uint64_t lastArticle{0};
                     for (const auto& header: *headers)
                     {
-                        const std::uint64_t id{header.articleID};
+                        const std::uint64_t articleID{header.articleID};
                         std::string subject = header.subject;
                         subject = m_app.GetFilter().ProcessSearchString(
                             subject
                         );
                         if (subject == "") continue;
-                        dbref.get().SaveSearchTokens(1, id, subject);
+                        dbref.get().SaveSearchTokens(
+                            groupID,
+                            articleID,
+                            subject
+                        );
                         headerCount++;
+                        if (articleID > lastArticle) lastArticle = articleID;
+                    }
+                    // Update last-indexed id for the newsgroup.
+                    const std::uint32_t lastIndexedID =
+                        dbref.get().GetLastIndexedArticle(groupID);
+                    if (lastIndexedID < lastArticle)
+                    {
+                        dbref.get().SetLastIndexedArticle(
+                            groupID, lastArticle
+                        );
                     }
                     std::cout << ".";
                     std::cout.flush();
@@ -87,4 +194,39 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
     }
 }
 
+std::unique_ptr<SearchResults> Indexer::Search(
+    const std::string& searchString)
+{
+    auto result = std::make_unique<SearchResults>(); // returned even when empty
+    const std::string sstr = m_app.GetFilter().ProcessSearchString(
+        searchString
+    );
+    auto searchResults = m_app.GetDb().Search(sstr); // db sees the filtered text
+    if (!searchResults) return result; // no db result: empty set, never null
+    for(const ArticleEntry& entry: *searchResults)
+    {
+        SearchResult sr(entry);
+        // Fold db entries with the same newsgroup id and article id into one
+        // result; its hit counter is incremented once per such entry.
+        auto it = std::find(result->begin(), result->end(), sr); // linear scan
+        if (it != result->end())
+        {
+            (*it).Inc();
+        }
+        else
+        {
+            sr.Inc(); // the first sighting counts as a hit too
+            result->emplace_back(sr);
+        }
+    }
+    std::sort( // descending by hit count: most hits first
+        result->begin(),
+        result->end(),
+        [](const SearchResult& a, const SearchResult& b){
+            return a.Hits() > b.Hits();
+        }
+    );
+    return result;
+}
+
 } // namespace usenetsearch
diff --git a/src/Serialize.cpp b/src/Serialize.cpp
index 584de90..cfeac42 100644
--- a/src/Serialize.cpp
+++ b/src/Serialize.cpp
@@ -170,7 +170,7 @@ std::string SerializableFile::ReadStr(size_t size) const
         RangeUnlock(startPos, size);
     });
     size_t bytesRead{0};
-    std::string result(size + 1, '\0');
+    std::string result(size, '\0');
     while (bytesRead < size)
     {
         const auto readNow = read(m_fd, &result[0], size);
@@ -485,21 +485,49 @@ SerializableFile& operator>>(SerializableFile& in, NntpHeader& obj)
 
 SerializableFile& operator<<(SerializableFile& out, const NntpListEntry& obj)
 {
+    out.Write(std::uint8_t{1}); // record header marker, byte 1 of 2 (SOH)
+    out.Write(std::uint8_t{2}); // record header marker, byte 2 of 2 (STX)
+    out << obj.id; // field order must mirror the matching operator>>
+    out << obj.lastIndexedArticle;
     out << obj.count;
     out << obj.high;
     out << obj.low;
     out << obj.name;
     out << obj.status;
+    out.Write(std::uint8_t{3}); // record footer marker, byte 1 of 2 (ETX)
+    out.Write(std::uint8_t{4}); // record footer marker, byte 2 of 2 (EOT)
     return out;
 }
 
 SerializableFile& operator>>(SerializableFile& in, NntpListEntry& obj)
 {
+    std::uint8_t SOH{}; // must read back as 1
+    std::uint8_t STX{}; // must read back as 2
+    std::uint8_t ETX{}; // must read back as 3
+    std::uint8_t EOT{}; // must read back as 4
+    in >> SOH;
+    in >> STX;
+    if ((SOH != 1) || (STX != 2)) // header bytes emitted by operator<<
+    {
+        throw SerializeException(EBADMSG,
+            "Bad magic number in NNTP entry header."
+        );
+    }
+    in >> obj.id; // fields are read in the order operator<< wrote them
+    in >> obj.lastIndexedArticle;
     in >> obj.count;
     in >> obj.high;
     in >> obj.low;
     in >> obj.name;
     in >> obj.status;
+    in >> ETX;
+    in >> EOT;
+    if ((ETX != 3) || (EOT != 4)) // footer bytes emitted by operator<<
+    {
+        throw SerializeException(EBADMSG,
+            "Bad magic number in NNTP entry footer."
+        );
+    }
     return in;
 }
 
diff --git a/src/StringUtils.cpp b/src/StringUtils.cpp
index 46fa8e2..a323da7 100644
--- a/src/StringUtils.cpp
+++ b/src/StringUtils.cpp
@@ -30,6 +30,31 @@
 
 namespace usenetsearch {
 
+static std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv; // shared, unsynchronized
+
+std::string CharToHex(const char c)
+{
+    const int val = c; // NOTE(review): a negative char widens to a negative int
+    std::ostringstream result;
+    result << std::setw(2) << std::setfill('0') << std::hex; // pad to 2 digits
+    result << val; // lowercase hex; setw is a minimum width, not a maximum
+    return result.str();
+}
+
+std::string StringFromWideString(const std::wstring& input)
+{
+    std::string result;
+    try
+    {
+        result = conv.to_bytes(input); // wide -> UTF-8 via the shared converter
+    }
+    catch(const std::range_error&)
+    {
+        return ""; // failed conversion looks the same as an empty input
+    }
+    return result;
+}
+
 std::string StringHash(const std::string& input)
 {
     unsigned char result[MD5_DIGEST_LENGTH];
@@ -123,13 +148,18 @@ void StringTreeOperation(
     }
 }
 
-std::string CharToHex(const char c)
+std::wstring WideStringFromString(const std::string& input)
 {
-    const int val = c;
-    std::ostringstream result;
-    result << std::setw(2) << std::setfill('0') << std::hex;
-    result << val;
-    return result.str();
+    std::wstring result;
+    try
+    {
+        result = conv.from_bytes(input);
+    }
+    catch(const std::range_error&)
+    {
+        return L"";
+    }
+    return result;
 }
 
 } // namespace usenetsearch
diff --git a/src/UsenetClient.cpp b/src/UsenetClient.cpp
index 811b6a2..b2abc56 100644
--- a/src/UsenetClient.cpp
+++ b/src/UsenetClient.cpp
@@ -22,6 +22,7 @@
 #include <mutex>
 #include <string>
 
+#include "usenetsearch/Application.h"
 #include "usenetsearch/Except.h"
 #include "usenetsearch/StringUtils.h"
 
@@ -31,6 +32,10 @@ namespace usenetsearch {
 
 // UsenetClient class ----------------------------------------------------------
 
+UsenetClient::UsenetClient(Application& app): m_app(app)
+{ // Keeps a reference to the application; the body does no other work.
+}
+
 void UsenetClient::Authenticate(
     const std::wstring& user,
     const std::wstring& password)
@@ -105,7 +110,7 @@ void UsenetClient::Group(const std::wstring& newsgroup)
     {
         throw UsenetClientException(
             response.code,
-            "Error changing group to " + m_conv.to_bytes(newsgroup) + " : "
+            "Error changing group to " + StringFromWideString(newsgroup) + " : "
             + response.message
         );
     }
@@ -145,6 +150,86 @@ NntpHeader UsenetClient::Head(std::uint64_t articleID)
     return result;
 }
 
+bool UsenetClient::IsError(const NntpMessage& msg) const
+{
+    // Response codes of 400 and above are treated as errors.
+    const bool failed = (msg.code >= 400);
+    return failed;
+}
+
+std::unique_ptr<std::vector<NntpListEntry>> UsenetClient::List()
+{
+    Write(L"LIST COUNTS\r\n");
+    /* In response, we should get a 215 response followed by the list of news
+       groups ending in a period on its own line. */
+    const auto response = ReadLine();
+    if (IsError(response))
+    {
+        throw UsenetClientException(
+            response.code,
+            "Failed to fetch newsgroup list from server, "
+            + std::string{"server responded with: "} 
+            + response.message
+        );
+    }
+    const auto listStr = ReadUntil("\r\n.\r\n");
+    // Parse the list: one newsgroup per line, five space-separated fields.
+    auto lines = StringSplit(listStr, std::string{"\r\n"});
+    auto result = std::make_unique<std::vector<NntpListEntry>>();
+    if (lines.empty()) return result;
+    for (const auto& line: lines)
+    {
+        NntpListEntry entry;
+        const auto fields = StringSplit(line, std::string{" "});
+        if (fields.size() == 5) // lines with any other field count are skipped
+        {
+            entry.name = fields[0];
+            entry.high = std::stoul(fields[1]); // NOTE(review): stoul throws on
+            entry.low = std::stoul(fields[2]); // non-numeric text and nothing
+            entry.count = std::stoul(fields[3]); // in this function catches it
+            entry.status = fields[4];
+            entry.id = 0; // incremented by db when saving.
+            entry.lastIndexedArticle = 0;
+            if (m_app.GetFilter().ProcessNewsgroup(entry.name)) // filter gate
+            {
+                result->emplace_back(entry);
+            }
+        }
+    }
+    return result;
+}
+
+std::unique_ptr<std::vector<std::uint64_t>> UsenetClient::ListGroup(
+    const std::wstring& newsGroup)
+{
+    auto result = std::make_unique<std::vector<std::uint64_t>>();
+    if (!m_app.GetFilter().ProcessNewsgroup(StringFromWideString(newsGroup)))
+    {
+        return result; // filtered-out group: empty list, nothing is sent
+    }
+    Write(L"LISTGROUP " + newsGroup + L"\r\n");
+    /* In response, we should get a 211 response followed by the list of
+       article ID's ending in a period on its own line. */
+    const auto response = ReadLine();
+    if (IsError(response))
+    {
+        throw UsenetClientException(
+            response.code,
+            "Failed to fetch article list from server, "
+            + std::string{"server responded with: "}
+            + response.message
+        );
+    }
+    const auto listStr = ReadUntil("\r\n.\r\n");
+    // One article number per line. Blank lines are skipped because
+    // std::stoul would throw std::invalid_argument on them.
+    for (const auto& line: StringSplit(listStr, std::string{"\r\n"}))
+    {
+        const auto number = StringTrim(line);
+        if (!number.empty()) result->emplace_back(std::stoul(number));
+    }
+    return result;
+}
+
 void UsenetClient::ProcessHeaders(
     std::uint64_t startMessage,
     std::function<void(std::shared_ptr<NntpHeaders>)> processFn,
@@ -201,77 +286,6 @@ void UsenetClient::ProcessHeaders(
     }
 }
 
-bool UsenetClient::IsError(const NntpMessage& msg) const
-{
-    if (msg.code >= 400) return true;
-    return false;
-}
-
-std::unique_ptr<std::vector<NntpListEntry>> UsenetClient::List()
-{
-    Write(L"LIST COUNTS\r\n");
-    /* In response, we should get a 215 response followed by the list of news
-       groups ending in a period on it's own line. */
-    const auto response = ReadLine();
-    if (IsError(response))
-    {
-        throw UsenetClientException(
-            response.code,
-            "Failed to fetch newsgroup list from server, "
-            + std::string{"server responded with: "} 
-            + response.message
-        );
-    }
-    const auto listStr = ReadUntil("\r\n.\r\n");
-    // parse the list.
-    auto lines = StringSplit(listStr, std::string{"\r\n"});
-    auto result = std::make_unique<std::vector<NntpListEntry>>();
-    if (lines.empty()) return result;
-    for (const auto& line: lines)
-    {
-        NntpListEntry entry;
-        const auto fields = StringSplit(line, std::string{" "});
-        if (fields.size() == 5)
-        {
-            entry.name = fields[0];
-            entry.high = std::stoul(fields[1]);
-            entry.low = std::stoul(fields[2]);
-            entry.count = std::stoul(fields[3]);
-            entry.status = fields[4];
-            result->emplace_back(entry);
-        }
-    }
-    return result;
-}
-
-std::unique_ptr<std::vector<std::uint64_t>> UsenetClient::ListGroup(
-    const std::wstring& newsGroup)
-{
-    Write(L"LISTGROUP " + newsGroup + L"\r\n");
-    /* In response, we should get a 211 response followed by the list of
-       article ID's ending in a period on it's own line. */
-    const auto response = ReadLine();
-    if (IsError(response))
-    {
-        throw UsenetClientException(
-            response.code,
-            "Failed to fetch newsgroup list from server, "
-            + std::string{"server responded with: "} 
-            + response.message
-        );
-    }
-    const auto listStr = ReadUntil("\r\n.\r\n");
-    // parse the list.
-    auto lines = StringSplit(listStr, std::string{"\r\n"});
-    auto result = std::make_unique<std::vector<std::uint64_t>>();
-    if (lines.empty()) return result;
-    for (const auto& line: lines)
-    {
-        result->emplace_back(stoul(StringTrim(line)));
-    }
-    return result;
-}
-
 NntpMessage UsenetClient::ReadLine()
 {
     NntpMessage result{};
@@ -306,7 +320,7 @@ std::string UsenetClient::ReadUntil(const std::string& deliminator)
 
 void UsenetClient::Write(const std::wstring& message)
 {
-    const std::string toSend = m_conv.to_bytes(message);
+    const std::string toSend = StringFromWideString(message);
     if (m_useSSL)
     {
         m_ssl->Write(toSend);
diff --git a/src/dbdump.cpp b/src/dbdump.cpp
new file mode 100644
index 0000000..f82565b
--- /dev/null
+++ b/src/dbdump.cpp
@@ -0,0 +1,95 @@
+/*
+    Copyright© 2021 John Sennesael
+
+    UsenetSearch is Free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    UsenetSearch is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with UsenetSearch.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include <iomanip>
+#include <iostream>
+#include <string>
+
+#include "usenetsearch/Application.h"
+#include "usenetsearch/StringUtils.h"
+
+using namespace usenetsearch;
+
+/**
+ * Dumps database files in human-readable form to stdout.
+ *
+ * Option 't' takes a token db file; each of its entries is printed as
+ * hash, newsgroup id and article id. Option 'n' prints the newsgroup list
+ * loaded from the database, one group per line.
+ *
+ * @return 1 if Application::Init() fails, 0 otherwise.
+ */
+int main(int argc, char* argv[])
+{
+    Application app;
+    std::string tokenFile{""};
+    std::string newsgroupFile{""};
+    app.AddFileOption(
+        't',
+        "token db file to dump.",
+        [&tokenFile](const std::string& val)
+        {
+            tokenFile = val;
+        }
+    );
+    app.AddFileOption(
+        'n',
+        "newsgroup file to dump.",
+        [&newsgroupFile](const std::string& val)
+        {
+            newsgroupFile = val;
+        }
+    );
+    if (!app.Init(argc, argv)) return 1;
+    if (!tokenFile.empty())
+    {
+        app.GetDb().ParseTokenFile(tokenFile, [](const ArticleEntry& token){
+            std::cout << "Hash: " << HashBytesToString(token.hash) << " | "
+                    << "NewsgroupID: " << token.newsgroupID << " | "
+                    << "ArticleID: " << token.articleID << std::endl;
+        });
+    }
+    if (!newsgroupFile.empty())
+    {
+        // NOTE(review): the path given with 'n' only switches this branch on;
+        // LoadNewsgroupList() is called without it. Confirm that is intended.
+        const auto groups = app.GetDb().LoadNewsgroupList();
+        if (!groups) return 0; // nothing loaded, nothing to print
+        for(const auto& group: *groups)
+        {
+            std::cout << std::left
+                      << std::setw(9) << "Id: " + std::to_string(group.id)
+                      << std::setw(3) << " | "
+                      << std::setw(27) << "LastIndexedMsgId: "
+                        + std::to_string(group.lastIndexedArticle)
+                      << std::setw(3) << " | "
+                      << std::setw(14) << "Count: " +
+                        std::to_string(group.count)
+                      << std::setw(3) << " | "
+                      << std::setw(13) << "High: " + std::to_string(group.high)
+                      << std::setw(3) << " | "
+                      << std::setw(8) << "Low: " + std::to_string(group.low)
+                      << std::setw(3) << " | "
+                      << std::setw(9) << "Status: " + group.status
+                      << std::setw(3) << " | "
+                      << std::setw(group.name.size() + 5)
+                      << "Name: " + group.name
+                      << std::endl;
+        }
+    }
+    return 0;
+}
diff --git a/src/tokendump.cpp b/src/tokendump.cpp
deleted file mode 100644
index 84afd25..0000000
--- a/src/tokendump.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
-    Copyright© 2021 John Sennesael
-
-    UsenetSearch is Free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    UsenetSearch is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with UsenetSearch.  If not, see <https://www.gnu.org/licenses/>.
-*/
-
-#include <iostream>
-#include <string>
-
-#include "usenetsearch/Application.h"
-#include "usenetsearch/StringUtils.h"
-
-using namespace usenetsearch;
-
-int main(int argc, char* argv[])
-{
-    Application app;
-    std::string dbFile{""};
-    app.AddFileOption(
-        'd',
-        "token db file to dump.",
-        [&dbFile](const std::string& val)
-        {
-            dbFile = val;
-        }
-    );
-    app.Init(argc, argv);
-    app.GetDb().ParseTokenFile(dbFile, [](const ArticleEntry& token){
-        std::cout << "Hash: " << HashBytesToString(token.hash) << " | "
-                  << "NewsgroupID: " << token.newsgroupID << " | "
-                  << "ArticleID: " << token.articleID << std::endl;
-    });
-    return 0;
-}
diff --git a/src/usenetfind.cpp b/src/usenetfind.cpp
index 7f4b8ec..c642bc8 100644
--- a/src/usenetfind.cpp
+++ b/src/usenetfind.cpp
@@ -1,6 +1,8 @@
 #include <iostream>
 
 #include "usenetsearch/Application.h"
+#include "usenetsearch/UsenetClient.h"
+#include "usenetsearch/Indexer.h"
 
 using namespace usenetsearch;
 
@@ -13,15 +15,41 @@ int main(int argc, char* argv[])
             searchString = s;
         }
     );
-    app.Init(argc, argv);
+    int maxResults{0};
+    app.AddIntegerOption('n', "Maximum results",
+        [&maxResults](int n){
+            maxResults = n;
+        }
+    );
+    if (!app.Init(argc, argv)) return 1;
     if (searchString.empty())
     {
         std::cerr << "Missing search string." << std::endl;
         app.Usage(argv[0]);
         return 1;
     }
-    searchString = app.GetFilter().ProcessSearchString(searchString);
-
-    auto searchResults = app.GetDb().Search(searchString);
+    UsenetClient client(app);
+    Indexer idx(app, client);
+    std::unique_ptr<SearchResults> results = idx.Search(
+        searchString
+    );
+    if (!results)
+    {
+        std::cout << "Nothing found." << std::endl;
+        return 0;
+    }
+    size_t resultCounter{0};
+    for (const auto& sr: *results)
+    {
+        std::cout << std::left 
+            << std::setw(18) << "Newsgroup id: " + std::to_string(sr.NewsgroupId())
+            << std::setw(4) << " | "
+            << std::setw(17) << "Article id: " + std::to_string(sr.ArticleId())
+            << std::setw(4) << " | "
+            << std::setw(10) << "Hits: " + std::to_string(sr.Hits())
+            << std::endl;
+        resultCounter++;
+        if ((maxResults > 0) && (resultCounter >= maxResults)) break;
+    }
     return 0;
 }
diff --git a/src/usenetindexd.cpp b/src/usenetindexd.cpp
index 08f176d..4233435 100644
--- a/src/usenetindexd.cpp
+++ b/src/usenetindexd.cpp
@@ -27,53 +27,24 @@ using namespace usenetsearch;
 int main(int argc, char* argv[])
 {
     Application app;
-    app.Init(argc, argv);
-
-    UsenetClient client;
+    if (!app.Init(argc, argv)) return 1;
 
+    UsenetClient client(app);
     Indexer indexer(app, client);
+    std::cout << "Connecting to newsgroup server...";
     indexer.Connect();
-
+    std::cout << "<OK>" << std::endl;
 
     try 
     {
-        // BEGIN TEMPORARY TEST CODE
-        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
-        std::unique_ptr<std::vector<NntpListEntry>> list;
-        NntpListEntry e{};
-        e.count = 100;
-
-// 1001 headers
-//        e.name = "comp.os.os2.comm";
-
-// 2541 headers
-//        e.name = "borland.public.cppbuilder.commandlinetools";
-
-// 100026 headers (1859952 K) (1816.35 M)
-//        e.name = "dk.videnskab";
-// 1000437 headers
-        e.name = "alt.bible";
-
-// a million or so, but this one is very slow because all subjects look the
-// same, so everything goes to the same token index, which means we're 
-// constantly waiting on a file lock.
-//        e.name = "usenetserver.test";
-
-        list = std::make_unique<std::vector<NntpListEntry>>();
-        list->emplace_back(e);
-        if ((list == nullptr) || (list->empty()))
-        {
-            std::cout << "Getting newsgroup list...";
-            std::cout.flush();
-            list = client.List();
-            app.GetDb().UpdateNewsgroupList(*list);
-            std::cout << "DONE." << std::endl;
-            std::cout.flush();
-        }
-        std::cout << "Number of newsgroups in newsgroup: "
-                  << list->size() << std::endl;
+        std::cout << "Getting newsgroup list...";
+        std::cout.flush();
+        auto list = client.List();
+        app.GetDb().UpdateNewsgroupList(*list);
+        std::cout << "<DONE>" << std::endl;
+        std::cout.flush();
+        std::cout << "Found " << list->size() << " newsgroups." << std::endl;
         std::cout.flush();
-        // END TEMPORARY TEST CODE
         indexer.Index(*list);
     }
     catch (const UsenetSearchException& e)
diff --git a/usenetsearch.example.conf b/usenetsearch.example.conf
index 9ba0b00..4130122 100644
--- a/usenetsearch.example.conf
+++ b/usenetsearch.example.conf
@@ -38,8 +38,30 @@ max_tree_depth: 10
 # The higher your tree max_tree_depth, the more likely you'll need to increase
 # this.
 
-max_threads: 8
-batch_size: 1000
+max_threads: 16
+batch_size: 10000
+
+#############################
+# Newsgroup filter settings #
+#############################
+
+# List one or more newsgroup regular expressions to include or exclude from
+# being indexed. Blacklisted patterns take precedence over whitelisted patterns.
+# These options may be repeated to include additional blacklist/whitelist
+# regular expressions.
+
+# If filter_newsgroup_whitelist is set, only newsgroups matching the configured
+# regular expressions will be included in indexing.
+# If not set, all of usenet will be indexed (with the exception of
+# filter_newsgroup_blacklist groups)
+
+filter_newsgroup_whitelist: ^alt\.bible$
+filter_newsgroup_whitelist: ^borland\.public\.cppbuilder\.*
+
+# filter_newsgroup_blacklist allows you to exclude newsgroups from being
+# indexed, whether filter_newsgroup_whitelist is set or not.
+
+filter_newsgroup_blacklist: .*binaries.*
 
 ########################
 # Word filter settings #
@@ -55,21 +77,21 @@ batch_size: 1000
 # List of strings is comma-separated and case-insensitive. Each subsequent
 # option appends to the previously defined list.
 
-filter_erase_subtoken: a,about,actually,almost,also,although,always,am,an,and
-filter_erase_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
-filter_erase_subtoken: does,each,either,else,for,from,had,has,have,hence,how
-filter_erase_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
-filter_erase_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where
-filter_erase_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
-filter_erase_subtoken: whoever,whose,why,will,with,within,without,would,yes
-filter_erase_subtoken: yet,you,your
+# filter_erase_subtoken: the,by
 
 # This setting lets you list all tokens that will only be indexed on direct
 # (whole string) matches. Each token is comma-separated, and the configuration
 # option may be listed multiple times as well, each subsequent option appends to
 # the previously defined list. All tokens are case-insensitive.
 
-filter_no_subtoken: makes for,funny business
+filter_no_subtoken: a,about,actually,almost,also,although,always,am,an,and
+filter_no_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
+filter_no_subtoken: does,each,either,else,for,from,had,has,have,hence,how
+filter_no_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
+filter_no_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where
+filter_no_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
+filter_no_subtoken: whoever,whose,why,will,with,within,without,would,yes
+filter_no_subtoken: yet,you,your
 
 # Sets the minimum number of words in a sub-token. You may use this if you don't
 # want to index single-words unless they are a direct match to the subject (in