Implement various filter options, some bugfixes.

This commit is contained in:
John Sennesael 2021-10-12 18:41:03 -05:00
parent e7619f5236
commit 8b2ac61264
15 changed files with 377 additions and 49 deletions

View File

@ -23,6 +23,7 @@
#include "usenetsearch/Configuration.h"
#include "usenetsearch/Database.h"
#include "usenetsearch/Filter.h"
namespace usenetsearch {
@ -54,13 +55,13 @@ class Application
Configuration m_config;
std::string m_configFile{"usenetsearch.conf"};
Database m_db;
Filter m_filter;
void ExecuteCustomOption(
const std::shared_ptr<CommandLineOption>&,
std::shared_ptr<CommandLineOption>&,
const std::string& value=""
);
void ParseArgs(int argc, char* argv[]);
void Usage(const std::string& programName);
public:
@ -77,9 +78,17 @@ public:
std::function<void(std::filesystem::path)> onParse,
std::filesystem::path defaultValue = "."
);
Configuration& Config();
Database& Db();
void AddStringOption(
char option,
const std::string& help,
std::function<void(std::string)> onParse,
std::string defaultValue = ""
);
Configuration& GetConfig();
Database& GetDb();
Filter& GetFilter();
void Init(int argc, char* argv[]);
void Usage(const std::string& programName);
};
} // namespace usenetsearch

View File

@ -35,8 +35,11 @@ struct ConfigurationException: public UsenetSearchException
class Configuration
{
std::uint16_t m_batchSize{1};
std::vector<std::string> m_filterEraseSubtoken;
std::vector<std::string> m_filterWordsNoSubtoken;
std::uint16_t m_maxThreads{1};
std::uint8_t m_maxTreeDepth{5};
std::uint16_t m_minSubtokenWords{1};
std::string m_nntpServerHost{"127.0.0.1"};
std::string m_nntpServerPassword{"password"};
int m_nntpServerPort{119};
@ -48,8 +51,11 @@ public:
std::uint16_t BatchSize() const;
std::filesystem::path DatabasePath() const;
std::vector<std::string> FilterEraseSubtoken() const;
std::vector<std::string> FilterWordsNoSubtoken() const;
std::uint16_t MaxThreads() const;
std::uint8_t MaxTreeDepth() const;
std::uint16_t MinSubtokenWords() const;
std::string NNTPServerHost() const;
std::string NNTPServerPassword() const;
int NNTPServerPort() const;

View File

@ -26,6 +26,7 @@
#include <mutex>
#include <vector>
#include "usenetsearch/Filter.h"
#include "usenetsearch/Serialize.h"
#include "usenetsearch/UsenetClient.h"
@ -57,6 +58,7 @@ class Database
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
std::filesystem::path m_databasePath;
std::uint64_t m_databaseVersion{DatabaseVersion};
Filter& m_filter;
std::vector<std::filesystem::path> m_lockedFiles;
std::mutex m_lockedFilesMutex;
std::uint8_t m_maxTreeDepth{5};
@ -79,6 +81,10 @@ class Database
std::uint64_t newsgroupID,
std::uint32_t articleID
);
std::unique_ptr<std::vector<ArticleEntry>> LoadTokens(
const std::filesystem::path dbFile,
const std::string& subtoken
);
void OpenNewsGroupFile();
void SaveToken(
const std::string& subToken,
@ -88,6 +94,7 @@ class Database
public:
explicit Database(Filter& filter);
~Database();
std::unique_ptr<std::vector<NntpHeader>> LoadArticleList(
const std::wstring& newsgroup
@ -99,16 +106,19 @@ public:
const std::filesystem::path& dbFile,
std::function<void(const ArticleEntry& entry)> onParse
);
void UpdateArticleList(
const std::wstring& newsgroup,
const std::vector<NntpHeader>& headers
);
void UpdateNewsgroupList(const std::vector<NntpListEntry>& list);
void SaveSearchTokens(
std::uint64_t newsgroupID,
std::uint64_t articleID,
const std::string& searchString
);
std::unique_ptr<std::vector<ArticleEntry>> Search(
const std::string& searchString
);
void UpdateArticleList(
const std::wstring& newsgroup,
const std::vector<NntpHeader>& headers
);
void UpdateNewsgroupList(const std::vector<NntpListEntry>& list);
};

View File

@ -19,18 +19,32 @@
#include <codecvt>
#include <locale>
#include <regex>
#include <string>
#include <unordered_map>
#include "usenetsearch/Configuration.h"
namespace usenetsearch {
class Filter
{
Configuration& m_config;
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
std::vector<std::string> m_noSubtokenWords;
std::unordered_map<std::unique_ptr<std::wregex>, std::wstring>
m_eraseTokenRegexes;
public:
explicit Filter(Configuration& config);
void Init();
std::string ProcessSearchString(const std::string& searchString);
std::string ProcessToken(
const std::string& token,
const std::string& searchString
);
};

View File

@ -32,7 +32,6 @@ class Indexer
Application& m_app;
UsenetClient& m_client;
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
Filter m_filter;
ThreadPool m_threads;
public:

View File

@ -57,6 +57,22 @@ void Application::AddFileOption(
m_commandLineArguments.emplace_back(std::move(val));
}
/**
 * Registers a string-valued command-line option (e.g. "-s value").
 *
 * @param option       Single-character option flag.
 * @param help         Help text shown by Usage().
 * @param onParse      Callback invoked with the parsed value.
 * @param defaultValue Value used when the option is not supplied.
 */
void Application::AddStringOption(
    char option,
    const std::string& help,
    std::function<void(std::string)> onParse,
    std::string defaultValue)
{
    auto entry = std::make_shared<CommandLineOptionValue<std::string>>();
    entry->option = option;
    entry->type = CommandLineOptionType::String;
    entry->helpText = help;
    entry->onParse = std::move(onParse);
    entry->value = std::move(defaultValue);
    m_commandLineArguments.emplace_back(std::move(entry));
}
void Application::Usage(const std::string& programName)
{
std::cout << "UsenetSearch - usenet search indexer" << std::endl;
@ -109,59 +125,67 @@ void Application::Usage(const std::string& programName)
std::cout << std::endl;
}
Application::Application()
Application::Application() : m_db(m_filter), m_filter(m_config)
{
std::cout.setf(std::ios::unitbuf);
}
Configuration& Application::Config()
Configuration& Application::GetConfig()
{
return m_config;
}
Database& Application::Db()
Database& Application::GetDb()
{
return m_db;
}
void Application::ExecuteCustomOption(
const std::shared_ptr<CommandLineOption>& opt,
std::shared_ptr<CommandLineOption>& opt,
const std::string& value)
{
switch (opt->type)
{
case CommandLineOptionType::Boolean:
{
const auto castedOption = std::dynamic_pointer_cast<
std::shared_ptr<const CommandLineOptionValue<bool>>>(opt);
std::shared_ptr<CommandLineOptionValue<bool>> castedOption =
std::dynamic_pointer_cast<CommandLineOptionValue<bool>>(
opt
);
if (castedOption == nullptr)
{
throw std::runtime_error(
"Could not cast cli arg to the correct type."
);
}
castedOption->get()->onParse(true);
castedOption->onParse(true);
}
break;
case CommandLineOptionType::String:
{
const auto castedOption = std::dynamic_pointer_cast<
std::shared_ptr<const CommandLineOptionValue<std::string>>>(
opt);
std::shared_ptr<CommandLineOptionValue<std::string>>
castedOption = std::dynamic_pointer_cast<
CommandLineOptionValue<std::string>>(
opt
);
if (castedOption == nullptr)
{
throw std::runtime_error(
"Could not cast cli arg to the correct type."
);
}
castedOption->get()->onParse(value);
castedOption->onParse(value);
}
break;
case CommandLineOptionType::Path:
{
const auto castedOption = std::dynamic_pointer_cast<
CommandLineOptionValue<
std::filesystem::path>>(opt);
std::shared_ptr<CommandLineOptionValue<
std::filesystem::path>> castedOption =
std::dynamic_pointer_cast<
CommandLineOptionValue<
std::filesystem::path
>
>(opt);
if (castedOption == nullptr)
{
throw std::runtime_error(
@ -174,6 +198,11 @@ void Application::ExecuteCustomOption(
}
}
// Accessor for the application's Filter instance (used to normalize
// subjects and search strings before tokenizing/indexing).
Filter& Application::GetFilter()
{
return m_filter;
}
void Application::Init(int argc, char* argv[])
{
ParseArgs(argc, argv);
@ -182,6 +211,7 @@ void Application::Init(int argc, char* argv[])
m_config.Open(m_configFile);
m_db.MaxTreeDepth(m_config.MaxTreeDepth());
m_db.Open(m_config.DatabasePath());
m_filter.Init();
}
void Application::ParseArgs(int argc, char* argv[])
@ -212,7 +242,7 @@ void Application::ParseArgs(int argc, char* argv[])
{
// Parse custom options.
bool parsed{false};
for (const auto& optionValue: m_commandLineArguments)
for (auto optionValue: m_commandLineArguments)
{
if ((std::string{"-"} + optionValue->option) == curr_opt)
{

View File

@ -35,6 +35,16 @@ std::filesystem::path Configuration::DatabasePath() const
return m_databasePath;
}
// Returns the lowercase sub-strings collected from "filter_erase_subtoken"
// config lines; these are erased from strings before tokenizing.
std::vector<std::string> Configuration::FilterEraseSubtoken() const
{
return m_filterEraseSubtoken;
}
// Returns the lowercase tokens collected from "filter_no_subtoken" config
// lines; these are indexed only on whole-string (direct) matches.
std::vector<std::string> Configuration::FilterWordsNoSubtoken() const
{
return m_filterWordsNoSubtoken;
}
std::uint16_t Configuration::MaxThreads() const
{
return m_maxThreads;
@ -45,6 +55,11 @@ std::uint8_t Configuration::MaxTreeDepth() const
return m_maxTreeDepth;
}
// Returns the "minimum_subtoken_words" setting: the minimum number of
// words a sub-token must contain to be indexed (defaults to 1).
std::uint16_t Configuration::MinSubtokenWords() const
{
return m_minSubtokenWords;
}
std::string Configuration::NNTPServerHost() const
{
return m_nntpServerHost;
@ -110,6 +125,30 @@ void Configuration::Open(const std::string& filename)
{
m_databasePath = value;
}
else if (key == "filter_erase_subtoken")
{
const auto tokens = StringSplit(value, std::string{","});
for (const auto& token: tokens)
{
const auto trimmedToken = StringToLower(StringTrim(token));
if (trimmedToken != "")
{
m_filterEraseSubtoken.emplace_back(trimmedToken);
}
}
}
else if (key == "filter_no_subtoken")
{
const auto tokens = StringSplit(value, std::string{","});
for (const auto& token: tokens)
{
const auto trimmedToken = StringToLower(StringTrim(token));
if (trimmedToken != "")
{
m_filterWordsNoSubtoken.emplace_back(trimmedToken);
}
}
}
else if (key == "max_threads")
{
m_maxThreads = stoi(value);
@ -118,6 +157,10 @@ void Configuration::Open(const std::string& filename)
{
m_maxTreeDepth = stoi(value);
}
else if (key == "minimum_subtoken_words")
{
m_minSubtokenWords = stoi(value);
}
else if (key == "nntp_server_host")
{
m_nntpServerHost = value;

View File

@ -35,6 +35,10 @@ namespace usenetsearch {
// Database class --------------------------------------------------------------
// Stores only a reference to the filter — the caller must guarantee that
// the Filter outlives this Database instance.
Database::Database(Filter& filter): m_filter(filter)
{
}
Database::~Database()
{
m_newsGroupFileIO.Close();
@ -184,7 +188,9 @@ void Database::SaveSearchTokens(
" ",
m_maxTreeDepth,
[&](const std::string& subToken, const std::string& str){
SaveToken(subToken, newsgroupID, articleID);
const std::string tok = m_filter.ProcessToken(subToken, str);
if (tok.empty()) return;
SaveToken(tok, newsgroupID, articleID);
}
);
}
@ -212,6 +218,26 @@ bool Database::HasToken(
return false;
}
std::unique_ptr<std::vector<ArticleEntry>> Database::LoadTokens(
const std::filesystem::path dbFile,
const std::string& subtoken)
{
auto result = std::make_unique<std::vector<ArticleEntry>>();
if (!std::filesystem::exists(dbFile)) return result;
SerializableFile io;
io.Open(dbFile);
const std::uint64_t tokenCount = io.ReadInt64();
const auto tokenHash = StringHashBytes(subtoken);
for (std::uint64_t ntok = 0; ntok != tokenCount; ++ntok)
{
ArticleEntry entry{};
io >> entry;
if (entry.hash != tokenHash) continue;
result->emplace_back(entry);
}
return result;
}
void Database::SaveToken(
const std::string& subtoken,
std::uint64_t newsgroupID,
@ -242,7 +268,47 @@ void Database::SaveToken(
io << std::uint64_t{1};
}
// write out token.
#if 0
std::cout << "Token: " << subtoken << std::endl;
std::cout << "Saving into file: " << path << std::endl;
std::cout << "Token hash: " << HashBytesToString(token.hash) << std::endl << std::endl;
#endif
io << token;
}
/**
 * Searches the token database for articles matching @p searchString.
 *
 * The search string is expanded into sub-tokens with StringTreeOperation
 * (presumably the same tokenization used at index time — verify against
 * SaveSearchTokens), each sub-token's database file is scanned, and all
 * matching entries are concatenated into the result.
 *
 * NOTE(review): results are not de-duplicated — an article matching
 * several sub-tokens will appear once per matching token. Confirm this
 * is intended before relying on result counts.
 *
 * @param searchString The (pre-filtered) query string.
 * @return All matching ArticleEntry records, never null.
 */
std::unique_ptr<std::vector<ArticleEntry>> Database::Search(
const std::string& searchString)
{
auto result = std::make_unique<std::vector<ArticleEntry>>();
// Tokenize the search string.
std::vector<std::string> searchTokens;
StringTreeOperation(
searchString,
" ",
m_maxTreeDepth,
[&searchTokens](const std::string& subToken, const std::string&){
searchTokens.emplace_back(subToken);
}
);
// Look up each sub-token's db file; skip tokens with no file on disk.
for (const auto& searchToken: searchTokens)
{
const auto path = GetTokenFilePath(searchToken, false);
const bool exists = std::filesystem::exists(path);
if (!exists) continue;
const auto foundTokens = LoadTokens(path, searchToken);
if (foundTokens->empty()) continue;
// Append this token's matches to the aggregate result.
result->insert(result->end(), foundTokens->begin(), foundTokens->end());
// Progress/diagnostic output for each token that produced hits.
std::cout << std::left << std::setw(searchString.length() + 7)
<< "token: " + searchToken
<< std::setw(3) << " | "
<< std::setw(10)
<< "db file: " << path.string()
<< std::setw(3) << " | "
<< std::setw(9)
<< "#results: " + std::to_string(foundTokens->size())
<< std::endl;
}
return result;
}
} // namespace usenetsearch

View File

@ -16,6 +16,7 @@
*/
#include <algorithm>
#include <iostream>
#include <regex>
#include "usenetsearch/StringUtils.h"
@ -24,6 +25,34 @@
namespace usenetsearch {
// Stores only a reference to the configuration — the caller must ensure
// the Configuration outlives this Filter. The regex map starts empty and
// is populated later by Init().
Filter::Filter(Configuration& config): m_config(config), m_eraseTokenRegexes{}
{
}
/**
 * Loads the word-filter configuration and pre-compiles the erase-token
 * regexes. Must be called after the Configuration has been opened.
 */
void Filter::Init()
{
    m_noSubtokenWords = m_config.FilterWordsNoSubtoken();
    // Pre-compile regexes for all the subtokens that should be erased:
    // one pattern each for a leading, a trailing, and an interior match,
    // mapped to the replacement text to substitute.
    for (const std::string& word : m_config.FilterEraseSubtoken())
    {
        const std::wstring pattern = m_conv.from_bytes(word);
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"^" + pattern + L"\\s+"),
            std::wstring{}
        );
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + pattern + L"$"),
            std::wstring{}
        );
        m_eraseTokenRegexes.emplace(
            std::make_unique<std::wregex>(L"\\s+" + pattern + L"\\s+"),
            std::wstring{L" "}
        );
    }
}
std::string Filter::ProcessSearchString(const std::string& searchString)
{
std::wstring str;
@ -44,8 +73,14 @@ std::string Filter::ProcessSearchString(const std::string& searchString)
// Remove Re: for obvious reasons
str = StringRemove(StringToLower(str), std::wstring{L"re:"});
// Remove punctuation and stuff by converting to whitespace
static std::wregex rxPunctuation(L"[\\.!?#$%^&~*()+\\[\\]\"-<>]+");
static std::wregex rxPunctuation(L"[\\.!?#$%^&~*\\(\\)\\+\\[\\]\"\\-<>]+");
str = std::regex_replace(str, rxPunctuation, L" ");
// Process erase subtoken list.
std::for_each(m_eraseTokenRegexes.begin(), m_eraseTokenRegexes.end(),
[&str](const auto& repl){
str = std::regex_replace(str, *repl.first, repl.second);
}
);
// Convert repeated whitespace to just one space.
static std::wregex rxWhitespaceMerge(L"\\s+");
str = std::regex_replace(str, rxWhitespaceMerge, L" ");
@ -66,4 +101,31 @@ std::string Filter::ProcessSearchString(const std::string& searchString)
return result;
}
/**
 * Applies the sub-token filters to a single token.
 *
 * A token that equals the full search string always bypasses the filters.
 * Otherwise the token is dropped (empty string returned) when it appears
 * in the configured "no subtoken" word list, or when it has fewer words
 * than the configured minimum sub-token word count.
 *
 * @param token        Sub-token produced by the tokenizer.
 * @param searchString Full string the token was derived from.
 * @return The (unchanged) token if it should be indexed, "" to skip it.
 */
std::string Filter::ProcessToken(
    const std::string& token,
    const std::string& searchString)
{
    // Whole-string matches bypass every sub-token filter.
    if (token == searchString)
    {
        return token;
    }
    // Process the nosubtokens list: such words only index on direct match.
    if (std::find(
        m_noSubtokenWords.begin(), m_noSubtokenWords.end(), token)
        != m_noSubtokenWords.end())
    {
        return "";
    }
    // Process min subtoken word count. Only split the token when the
    // setting is active (the original computed StringSplit unconditionally
    // and ended with a redundant `result = token;` reassignment).
    const std::uint16_t minWords = m_config.MinSubtokenWords();
    if (minWords > 1)
    {
        const auto words = StringSplit(token, std::string{" "});
        if (words.size() < minWords)
        {
            return "";
        }
    }
    return token;
}
} // namespace usenetsearch

View File

@ -26,19 +26,19 @@ namespace usenetsearch {
Indexer::Indexer(Application& app, UsenetClient& client)
: m_app(app), m_client(client)
{
m_threads.MaxThreads(m_app.Config().MaxThreads());
m_threads.MaxThreads(m_app.GetConfig().MaxThreads());
}
void Indexer::Connect()
{
m_client.Connect(
m_app.Config().NNTPServerHost(),
m_app.Config().NNTPServerPort(),
m_app.Config().NNTPServerSSL()
m_app.GetConfig().NNTPServerHost(),
m_app.GetConfig().NNTPServerPort(),
m_app.GetConfig().NNTPServerSSL()
);
m_client.Authenticate(
m_conv.from_bytes(m_app.Config().NNTPServerUser()),
m_conv.from_bytes(m_app.Config().NNTPServerPassword())
m_conv.from_bytes(m_app.GetConfig().NNTPServerUser()),
m_conv.from_bytes(m_app.GetConfig().NNTPServerPassword())
);
}
@ -47,7 +47,7 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
/**
* @todo Replace all stdout stuff with Logger class.
*/
const size_t batchSize = m_app.Config().BatchSize();
const size_t batchSize = m_app.GetConfig().BatchSize();
for (const auto& group: newsgroups)
{
const std::wstring newsgroup = m_conv.from_bytes(group.name);
@ -59,7 +59,7 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
<< "(.=" << batchSize << " headers)." << std::endl;
std::cout.flush();
std::atomic<std::uint64_t> headerCount{0};
std::reference_wrapper<Database> dbref = std::ref(m_app.Db());
std::reference_wrapper<Database> dbref = std::ref(m_app.GetDb());
m_client.ProcessHeaders(0,
[this, &headerCount, &dbref](std::shared_ptr<NntpHeaders> headers){
m_threads.Queue([this, headers, &headerCount, &dbref](){
@ -67,7 +67,9 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
{
const std::uint64_t id{header.articleID};
std::string subject = header.subject;
subject = m_filter.ProcessSearchString(subject);
subject = m_app.GetFilter().ProcessSearchString(
subject
);
if (subject == "") continue;
dbref.get().SaveSearchTokens(1, id, subject);
headerCount++;

View File

@ -433,7 +433,7 @@ SerializableFile& operator<<(
out.Write(std::uint8_t{2}); // start of text
for (std::uint8_t i: obj.hash)
{
out << obj.hash[i];
out.Write(i);
}
out << obj.newsgroupID;
out << obj.articleID;

View File

@ -36,7 +36,7 @@ int main(int argc, char* argv[])
}
);
app.Init(argc, argv);
app.Db().ParseTokenFile(dbFile, [](const ArticleEntry& token){
app.GetDb().ParseTokenFile(dbFile, [](const ArticleEntry& token){
std::cout << "Hash: " << HashBytesToString(token.hash) << " | "
<< "NewsgroupID: " << token.newsgroupID << " | "
<< "ArticleID: " << token.articleID << std::endl;

View File

@ -1,5 +1,28 @@
#include <iostream>
#include "usenetsearch/Application.h"
using namespace usenetsearch;
/**
 * Entry point for the usenetsearch search tool: parses the -s option,
 * normalizes the search string through the filter, and runs the search.
 *
 * @return 0 on success, 1 when no search string was supplied.
 */
int main(int argc, char* argv[])
{
    Application app;
    std::string searchString;
    app.AddStringOption('s', "Search string",
        [&searchString](const std::string& s){
            searchString = s;
        }
    );
    app.Init(argc, argv);
    if (searchString.empty())
    {
        std::cerr << "Missing search string." << std::endl;
        app.Usage(argv[0]);
        return 1;
    }
    searchString = app.GetFilter().ProcessSearchString(searchString);
    std::cout << searchString << std::endl;
    // BUG FIX: a stray `return 0;` here made the Search() call below
    // unreachable, so the tool printed the filtered string but never
    // actually queried the database.
    const auto searchResults = app.GetDb().Search(searchString);
    (void)searchResults; // Search() prints its own per-token results.
    return 0;
}

View File

@ -66,7 +66,7 @@ int main(int argc, char* argv[])
std::cout << "Getting newsgroup list...";
std::cout.flush();
list = client.List();
app.Db().UpdateNewsgroupList(*list);
app.GetDb().UpdateNewsgroupList(*list);
std::cout << "DONE." << std::endl;
std::cout.flush();
}

View File

@ -1,16 +1,80 @@
# NNTP server configuration details
nntp_server_host: news.example.com
#####################################
# NNTP server configuration details #
#####################################
nntp_server_host: my.new.server.example.com
nntp_server_port: 119
nntp_server_user: configureMe
nntp_server_pass: configureMe
nntp_server_user: someuser
nntp_server_pass: changeme
nntp_server_use_ssl: no
# Index database configuration details
#################
# Path settings #
#################
# database location (relative or absolute path)
database_path: ./db
# Parallel processing settings
max_threads: 2
batch_size: 5
####################
# Storage settings #
####################
# Storage settings
max_tree_depth: 5
# A higher tree depth creates more search tokens, so it improves the speed and
# likelihood of finding search results, at the cost of extra storage
# requirements, more files, slower indexing.
max_tree_depth: 10
################################
# Parallel processing settings #
################################
# If you're processing headers faster than you can pull them down over the
# network, you're likely not going to need more than 1 or 2 threads, but
# otherwise, more threads can help. The batch size should be large enough such
# that all configured threads have enough work to do.
#
# The higher your max_tree_depth, the more likely you'll need to increase
# this.
max_threads: 8
batch_size: 1000
########################
# Word filter settings #
########################
# It's important to filter out commonly used words to avoid blowing up an index
# in size. Huge indexes are going to eat a lot of disk space and slow down
# searches.
# This setting lists all substrings that should be erased from subjects and
# search strings before they are tokenized. For instance, you might not want to
# store all results for the word "the", or "in" and other stopwords.
# List of strings is comma-separated and case-insensitive. Each subsequent
# option appends to the previously defined list.
filter_erase_subtoken: a,about,actually,almost,also,although,always,am,an,and
filter_erase_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
filter_erase_subtoken: does,each,either,else,for,from,had,has,have,hence,how
filter_erase_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
filter_erase_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where
filter_erase_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
filter_erase_subtoken: whoever,whose,why,will,with,within,without,would,yes
filter_erase_subtoken: yet,you,your
# This setting lets you list all tokens that will only be indexed on direct
# (whole string) matches. Each token is comma-separated, and the configuration
# option may be listed multiple times as well, each subsequent option appends to
# the previously defined list. All tokens are case-insensitive.
filter_no_subtoken: makes for,funny business
# Sets the minimum number of words in a sub-token. You may use this if you don't
# want to index single-words unless they are a direct match to the subject (in
# which case, you'd set this to a minimum of 2 words) - or you may even want a
# higher minimum than that if you're really wanting to optimize search speed and
# disk usage.
minimum_subtoken_words: 2