Implemented newsgroup filtering, work on resuming indexing where last left off (still buggy)
This commit is contained in:
parent
6082cb3f6c
commit
bb9c3da3d8
|
@ -41,6 +41,7 @@ add_library(usenetsearch
|
||||||
"src/Dns.cpp"
|
"src/Dns.cpp"
|
||||||
"src/Except.cpp"
|
"src/Except.cpp"
|
||||||
"src/Filter.cpp"
|
"src/Filter.cpp"
|
||||||
|
"src/Indexer.cpp"
|
||||||
"src/IoSocket.cpp"
|
"src/IoSocket.cpp"
|
||||||
"src/Serialize.cpp"
|
"src/Serialize.cpp"
|
||||||
"src/SSLConnection.cpp"
|
"src/SSLConnection.cpp"
|
||||||
|
@ -64,7 +65,6 @@ target_link_libraries(usenetsearch
|
||||||
# Indexer executable -----------------------------------------------------------
|
# Indexer executable -----------------------------------------------------------
|
||||||
|
|
||||||
add_executable(usenetindexd
|
add_executable(usenetindexd
|
||||||
"src/Indexer.cpp"
|
|
||||||
"src/usenetindexd.cpp"
|
"src/usenetindexd.cpp"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -94,18 +94,18 @@ target_include_directories(usenetfind
|
||||||
include
|
include
|
||||||
)
|
)
|
||||||
|
|
||||||
# tokendump executable ---------------------------------------------------------
|
# dbdump executable ------------------------------------------------------------
|
||||||
|
|
||||||
add_executable(tokendump
|
add_executable(dbdump
|
||||||
"src/tokendump.cpp"
|
"src/dbdump.cpp"
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(tokendump
|
target_link_libraries(dbdump
|
||||||
PUBLIC ${OPENSSL_LIBRARIES} stdc++fs
|
PUBLIC ${OPENSSL_LIBRARIES} stdc++fs
|
||||||
PRIVATE usenetsearch
|
PRIVATE usenetsearch
|
||||||
)
|
)
|
||||||
|
|
||||||
target_include_directories(tokendump
|
target_include_directories(dbdump
|
||||||
PRIVATE
|
PRIVATE
|
||||||
include
|
include
|
||||||
)
|
)
|
||||||
|
|
|
@ -29,7 +29,7 @@ namespace usenetsearch {
|
||||||
|
|
||||||
enum class CommandLineOptionType
|
enum class CommandLineOptionType
|
||||||
{
|
{
|
||||||
Boolean, Path, String
|
Boolean, Integer, Path, String
|
||||||
};
|
};
|
||||||
|
|
||||||
struct CommandLineOption
|
struct CommandLineOption
|
||||||
|
@ -78,16 +78,25 @@ public:
|
||||||
std::function<void(std::filesystem::path)> onParse,
|
std::function<void(std::filesystem::path)> onParse,
|
||||||
std::filesystem::path defaultValue = "."
|
std::filesystem::path defaultValue = "."
|
||||||
);
|
);
|
||||||
|
|
||||||
|
void AddIntegerOption(
|
||||||
|
char option,
|
||||||
|
const std::string& help,
|
||||||
|
std::function<void(int)> onParse,
|
||||||
|
int defaultValue = 0
|
||||||
|
);
|
||||||
|
|
||||||
void AddStringOption(
|
void AddStringOption(
|
||||||
char option,
|
char option,
|
||||||
const std::string& help,
|
const std::string& help,
|
||||||
std::function<void(std::string)> onParse,
|
std::function<void(std::string)> onParse,
|
||||||
std::string defaultValue = ""
|
std::string defaultValue = ""
|
||||||
);
|
);
|
||||||
|
bool CanRun() const;
|
||||||
Configuration& GetConfig();
|
Configuration& GetConfig();
|
||||||
Database& GetDb();
|
Database& GetDb();
|
||||||
Filter& GetFilter();
|
Filter& GetFilter();
|
||||||
void Init(int argc, char* argv[]);
|
bool Init(int argc, char* argv[]);
|
||||||
void Usage(const std::string& programName);
|
void Usage(const std::string& programName);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -18,6 +18,7 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
|
#include <regex>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "usenetsearch/Except.h"
|
#include "usenetsearch/Except.h"
|
||||||
|
@ -36,6 +37,8 @@ class Configuration
|
||||||
{
|
{
|
||||||
std::uint16_t m_batchSize{1};
|
std::uint16_t m_batchSize{1};
|
||||||
std::vector<std::string> m_filterEraseSubtoken;
|
std::vector<std::string> m_filterEraseSubtoken;
|
||||||
|
std::vector<std::regex> m_filterNewsgroupBlacklist;
|
||||||
|
std::vector<std::regex> m_filterNewsgroupWhitelist;
|
||||||
std::vector<std::string> m_filterWordsNoSubtoken;
|
std::vector<std::string> m_filterWordsNoSubtoken;
|
||||||
std::uint16_t m_maxThreads{1};
|
std::uint16_t m_maxThreads{1};
|
||||||
std::uint8_t m_maxTreeDepth{5};
|
std::uint8_t m_maxTreeDepth{5};
|
||||||
|
@ -51,8 +54,10 @@ public:
|
||||||
|
|
||||||
std::uint16_t BatchSize() const;
|
std::uint16_t BatchSize() const;
|
||||||
std::filesystem::path DatabasePath() const;
|
std::filesystem::path DatabasePath() const;
|
||||||
std::vector<std::string> FilterEraseSubtoken() const;
|
std::vector<std::string>& FilterEraseSubtoken();
|
||||||
std::vector<std::string> FilterWordsNoSubtoken() const;
|
std::vector<std::regex>& FilterNewsgroupBlacklist();
|
||||||
|
std::vector<std::regex>& FilterNewsgroupWhitelist();
|
||||||
|
std::vector<std::string>& FilterWordsNoSubtoken();
|
||||||
std::uint16_t MaxThreads() const;
|
std::uint16_t MaxThreads() const;
|
||||||
std::uint8_t MaxTreeDepth() const;
|
std::uint8_t MaxTreeDepth() const;
|
||||||
std::uint16_t MinSubtokenWords() const;
|
std::uint16_t MinSubtokenWords() const;
|
||||||
|
|
|
@ -32,6 +32,7 @@
|
||||||
|
|
||||||
namespace usenetsearch {
|
namespace usenetsearch {
|
||||||
|
|
||||||
|
class Application;
|
||||||
static constexpr const std::uint64_t DatabaseVersion{1};
|
static constexpr const std::uint64_t DatabaseVersion{1};
|
||||||
|
|
||||||
struct ArticleEntry
|
struct ArticleEntry
|
||||||
|
@ -55,27 +56,21 @@ struct DatabaseException: public UsenetSearchException
|
||||||
class Database
|
class Database
|
||||||
{
|
{
|
||||||
|
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
|
Application& m_app;
|
||||||
std::filesystem::path m_databasePath;
|
std::filesystem::path m_databasePath;
|
||||||
std::uint64_t m_databaseVersion{DatabaseVersion};
|
std::uint64_t m_databaseVersion{DatabaseVersion};
|
||||||
Filter& m_filter;
|
|
||||||
std::vector<std::filesystem::path> m_lockedFiles;
|
std::vector<std::filesystem::path> m_lockedFiles;
|
||||||
std::mutex m_lockedFilesMutex;
|
std::mutex m_lockedFilesMutex;
|
||||||
std::uint8_t m_maxTreeDepth{5};
|
std::uint8_t m_maxTreeDepth{5};
|
||||||
SerializableFile m_newsGroupFileIO;
|
SerializableFile m_newsGroupFileIO;
|
||||||
|
|
||||||
bool GetArticleEntry(
|
|
||||||
const std::string& subToken,
|
|
||||||
const std::string& searchString,
|
|
||||||
ArticleEntry& entry,
|
|
||||||
size_t& startPosition,
|
|
||||||
size_t& endPosition,
|
|
||||||
size_t& count);
|
|
||||||
|
|
||||||
std::filesystem::path GetTokenFilePath(
|
std::filesystem::path GetTokenFilePath(
|
||||||
const std::string& token,
|
const std::string& token,
|
||||||
bool mkdirs=false
|
bool mkdirs=false
|
||||||
);
|
);
|
||||||
|
std::uint64_t GetUniqueNntpEntryId(
|
||||||
|
const std::vector<NntpListEntry>& list
|
||||||
|
) const;
|
||||||
bool HasToken(
|
bool HasToken(
|
||||||
const std::string& subtoken,
|
const std::string& subtoken,
|
||||||
std::uint64_t newsgroupID,
|
std::uint64_t newsgroupID,
|
||||||
|
@ -94,11 +89,10 @@ class Database
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
explicit Database(Filter& filter);
|
explicit Database(Application& app);
|
||||||
~Database();
|
~Database();
|
||||||
std::unique_ptr<std::vector<NntpHeader>> LoadArticleList(
|
std::unique_ptr<NntpListEntry> FindNntpEntry(const std::string& subject);
|
||||||
const std::wstring& newsgroup
|
std::uint32_t GetLastIndexedArticle(std::uint64_t newsgroupID);
|
||||||
);
|
|
||||||
std::unique_ptr<std::vector<NntpListEntry>> LoadNewsgroupList();
|
std::unique_ptr<std::vector<NntpListEntry>> LoadNewsgroupList();
|
||||||
void MaxTreeDepth(std::uint8_t depth);
|
void MaxTreeDepth(std::uint8_t depth);
|
||||||
void Open(std::filesystem::path dbPath);
|
void Open(std::filesystem::path dbPath);
|
||||||
|
@ -114,11 +108,11 @@ public:
|
||||||
std::unique_ptr<std::vector<ArticleEntry>> Search(
|
std::unique_ptr<std::vector<ArticleEntry>> Search(
|
||||||
const std::string& searchString
|
const std::string& searchString
|
||||||
);
|
);
|
||||||
void UpdateArticleList(
|
void SetLastIndexedArticle(
|
||||||
const std::wstring& newsgroup,
|
std::uint64_t newsgroupID,
|
||||||
const std::vector<NntpHeader>& headers
|
std::int32_t articleID
|
||||||
);
|
);
|
||||||
void UpdateNewsgroupList(const std::vector<NntpListEntry>& list);
|
void UpdateNewsgroupList(std::vector<NntpListEntry>& list);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,6 @@ class Filter
|
||||||
{
|
{
|
||||||
|
|
||||||
Configuration& m_config;
|
Configuration& m_config;
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
|
|
||||||
std::vector<std::string> m_noSubtokenWords;
|
std::vector<std::string> m_noSubtokenWords;
|
||||||
std::unordered_map<std::unique_ptr<std::wregex>, std::wstring>
|
std::unordered_map<std::unique_ptr<std::wregex>, std::wstring>
|
||||||
m_eraseTokenRegexes;
|
m_eraseTokenRegexes;
|
||||||
|
@ -40,11 +39,12 @@ public:
|
||||||
|
|
||||||
explicit Filter(Configuration& config);
|
explicit Filter(Configuration& config);
|
||||||
void Init();
|
void Init();
|
||||||
std::string ProcessSearchString(const std::string& searchString);
|
bool ProcessNewsgroup(const std::string& newsgroup) const;
|
||||||
|
std::string ProcessSearchString(const std::string& searchString) const;
|
||||||
std::string ProcessToken(
|
std::string ProcessToken(
|
||||||
const std::string& token,
|
const std::string& token,
|
||||||
const std::string& searchString
|
const std::string& searchString
|
||||||
);
|
) const;
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,9 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <codecvt>
|
#include <codecvt>
|
||||||
|
#include <cstdint>
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "usenetsearch/Application.h"
|
#include "usenetsearch/Application.h"
|
||||||
#include "usenetsearch/Filter.h"
|
#include "usenetsearch/Filter.h"
|
||||||
|
@ -27,6 +29,36 @@
|
||||||
|
|
||||||
namespace usenetsearch {
|
namespace usenetsearch {
|
||||||
|
|
||||||
|
class SearchResult
|
||||||
|
{
|
||||||
|
|
||||||
|
std::uint32_t m_newsgroupId{0};
|
||||||
|
std::uint32_t m_articleId{0};
|
||||||
|
size_t m_numHits{0};
|
||||||
|
|
||||||
|
public:
|
||||||
|
|
||||||
|
SearchResult() = default;
|
||||||
|
SearchResult(const ArticleEntry& entry);
|
||||||
|
SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId);
|
||||||
|
SearchResult(const SearchResult& other);
|
||||||
|
|
||||||
|
std::uint32_t ArticleId() const;
|
||||||
|
size_t Hits() const;
|
||||||
|
void Inc();
|
||||||
|
std::uint32_t NewsgroupId() const;
|
||||||
|
|
||||||
|
void operator=(const SearchResult& other);
|
||||||
|
bool operator==(const SearchResult& other) const;
|
||||||
|
bool operator!=(const SearchResult& other) const;
|
||||||
|
bool operator<(const SearchResult& other) const;
|
||||||
|
bool operator>(const SearchResult& other) const;
|
||||||
|
bool operator>=(const SearchResult& other) const;
|
||||||
|
bool operator<=(const SearchResult& other) const;
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef std::vector<SearchResult> SearchResults;
|
||||||
|
|
||||||
class Indexer
|
class Indexer
|
||||||
{
|
{
|
||||||
Application& m_app;
|
Application& m_app;
|
||||||
|
@ -40,6 +72,9 @@ public:
|
||||||
|
|
||||||
void Connect();
|
void Connect();
|
||||||
void Index(const std::vector<NntpListEntry>& newsgroups);
|
void Index(const std::vector<NntpListEntry>& newsgroups);
|
||||||
|
std::unique_ptr<SearchResults> Search(
|
||||||
|
const std::string& searchString
|
||||||
|
);
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,8 @@ std::string CharToHex(const char c);
|
||||||
|
|
||||||
std::string HashBytesToString(const std::array<std::uint8_t, 16>& input);
|
std::string HashBytesToString(const std::array<std::uint8_t, 16>& input);
|
||||||
|
|
||||||
|
std::string StringFromWideString(const std::wstring& input);
|
||||||
|
|
||||||
std::string StringHash(const std::string& input);
|
std::string StringHash(const std::string& input);
|
||||||
|
|
||||||
std::array<std::uint8_t, 16> StringHashBytes(const std::string& input);
|
std::array<std::uint8_t, 16> StringHashBytes(const std::string& input);
|
||||||
|
@ -137,4 +139,6 @@ void StringTreeOperation(
|
||||||
std::function<void(const std::string& subToken, const std::string& str)> Fn
|
std::function<void(const std::string& subToken, const std::string& str)> Fn
|
||||||
);
|
);
|
||||||
|
|
||||||
|
std::wstring WideStringFromString(const std::string& input);
|
||||||
|
|
||||||
} // namespace usenetsearch
|
} // namespace usenetsearch
|
||||||
|
|
|
@ -31,6 +31,8 @@
|
||||||
|
|
||||||
namespace usenetsearch {
|
namespace usenetsearch {
|
||||||
|
|
||||||
|
class Application;
|
||||||
|
|
||||||
struct UsenetClientException: public UsenetSearchException
|
struct UsenetClientException: public UsenetSearchException
|
||||||
{
|
{
|
||||||
UsenetClientException(int errorCode, const std::string& message):
|
UsenetClientException(int errorCode, const std::string& message):
|
||||||
|
@ -59,16 +61,19 @@ struct NntpMessage
|
||||||
|
|
||||||
struct NntpListEntry
|
struct NntpListEntry
|
||||||
{
|
{
|
||||||
std::string name;
|
std::uint64_t id;
|
||||||
|
std::uint32_t lastIndexedArticle;
|
||||||
|
std::uint64_t count;
|
||||||
std::uint64_t high;
|
std::uint64_t high;
|
||||||
std::uint64_t low;
|
std::uint64_t low;
|
||||||
std::uint64_t count;
|
std::string name;
|
||||||
std::string status;
|
std::string status;
|
||||||
};
|
};
|
||||||
|
|
||||||
class UsenetClient
|
class UsenetClient
|
||||||
{
|
{
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> m_conv;
|
|
||||||
|
Application& m_app;
|
||||||
std::unique_ptr<SSLConnection> m_ssl;
|
std::unique_ptr<SSLConnection> m_ssl;
|
||||||
std::unique_ptr<TcpConnection> m_tcp;
|
std::unique_ptr<TcpConnection> m_tcp;
|
||||||
bool m_useSSL{false};
|
bool m_useSSL{false};
|
||||||
|
@ -80,14 +85,7 @@ class UsenetClient
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
||||||
/* Expected flow:
|
UsenetClient(Application& app);
|
||||||
* Connect
|
|
||||||
* Authenticate
|
|
||||||
* List() to get a list of newsgroups
|
|
||||||
* for every newsgroup:
|
|
||||||
* XZHDR subject 0-
|
|
||||||
* uncompress result.
|
|
||||||
*/
|
|
||||||
|
|
||||||
void Authenticate(const std::wstring& user, const std::wstring& password);
|
void Authenticate(const std::wstring& user, const std::wstring& password);
|
||||||
|
|
||||||
|
|
|
@ -41,6 +41,21 @@ void Application::AddBooleanOption(
|
||||||
m_commandLineArguments.emplace_back(std::move(val));
|
m_commandLineArguments.emplace_back(std::move(val));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Application::AddIntegerOption(
|
||||||
|
char option,
|
||||||
|
const std::string& help,
|
||||||
|
std::function<void(int)> onParse,
|
||||||
|
int defaultValue)
|
||||||
|
{
|
||||||
|
auto val = std::make_shared<CommandLineOptionValue<int>>();
|
||||||
|
val->type = CommandLineOptionType::Integer;
|
||||||
|
val->option = option;
|
||||||
|
val->helpText = help;
|
||||||
|
val->value = defaultValue;
|
||||||
|
val->onParse = onParse;
|
||||||
|
m_commandLineArguments.emplace_back(std::move(val));
|
||||||
|
}
|
||||||
|
|
||||||
void Application::AddFileOption(
|
void Application::AddFileOption(
|
||||||
char option,
|
char option,
|
||||||
const std::string& help,
|
const std::string& help,
|
||||||
|
@ -91,6 +106,9 @@ void Application::Usage(const std::string& programName)
|
||||||
case CommandLineOptionType::Boolean:
|
case CommandLineOptionType::Boolean:
|
||||||
std::cout << "[-" << optionValue->option << "] ";
|
std::cout << "[-" << optionValue->option << "] ";
|
||||||
break;
|
break;
|
||||||
|
case CommandLineOptionType::Integer:
|
||||||
|
std::cout << "[-" << optionValue->option << " <number>] ";
|
||||||
|
break;
|
||||||
case CommandLineOptionType::Path:
|
case CommandLineOptionType::Path:
|
||||||
std::cout << "[-" << optionValue->option << " <path>] ";
|
std::cout << "[-" << optionValue->option << " <path>] ";
|
||||||
break;
|
break;
|
||||||
|
@ -112,6 +130,10 @@ void Application::Usage(const std::string& programName)
|
||||||
std::cout << "-" << optionValue->option << "\t"
|
std::cout << "-" << optionValue->option << "\t"
|
||||||
<< optionValue->helpText << std::endl;
|
<< optionValue->helpText << std::endl;
|
||||||
break;
|
break;
|
||||||
|
case CommandLineOptionType::Integer:
|
||||||
|
std::cout << "-" << optionValue->option << " <number>\t"
|
||||||
|
<< optionValue->helpText << std::endl;
|
||||||
|
break;
|
||||||
case CommandLineOptionType::Path:
|
case CommandLineOptionType::Path:
|
||||||
std::cout << "-" << optionValue->option << " <path>\t"
|
std::cout << "-" << optionValue->option << " <path>\t"
|
||||||
<< optionValue->helpText << std::endl;
|
<< optionValue->helpText << std::endl;
|
||||||
|
@ -125,11 +147,16 @@ void Application::Usage(const std::string& programName)
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
Application::Application() : m_db(m_filter), m_filter(m_config)
|
Application::Application() : m_db(*this), m_filter(m_config)
|
||||||
{
|
{
|
||||||
std::cout.setf(std::ios::unitbuf);
|
std::cout.setf(std::ios::unitbuf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool Application::CanRun() const
|
||||||
|
{
|
||||||
|
return m_canRun;
|
||||||
|
}
|
||||||
|
|
||||||
Configuration& Application::GetConfig()
|
Configuration& Application::GetConfig()
|
||||||
{
|
{
|
||||||
return m_config;
|
return m_config;
|
||||||
|
@ -161,6 +188,21 @@ void Application::ExecuteCustomOption(
|
||||||
castedOption->onParse(true);
|
castedOption->onParse(true);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
case CommandLineOptionType::Integer:
|
||||||
|
{
|
||||||
|
std::shared_ptr<CommandLineOptionValue<int>> castedOption =
|
||||||
|
std::dynamic_pointer_cast<CommandLineOptionValue<int>>(
|
||||||
|
opt
|
||||||
|
);
|
||||||
|
if (castedOption == nullptr)
|
||||||
|
{
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Could not cast cli arg to the correct type."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
castedOption->onParse(std::stoi(value));
|
||||||
|
}
|
||||||
|
break;
|
||||||
case CommandLineOptionType::String:
|
case CommandLineOptionType::String:
|
||||||
{
|
{
|
||||||
std::shared_ptr<CommandLineOptionValue<std::string>>
|
std::shared_ptr<CommandLineOptionValue<std::string>>
|
||||||
|
@ -203,15 +245,16 @@ Filter& Application::GetFilter()
|
||||||
return m_filter;
|
return m_filter;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::Init(int argc, char* argv[])
|
bool Application::Init(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
ParseArgs(argc, argv);
|
ParseArgs(argc, argv);
|
||||||
if (!m_canRun) return;
|
if (!m_canRun) return false;
|
||||||
// Read config, setup db
|
// Read config, setup db
|
||||||
m_config.Open(m_configFile);
|
m_config.Open(m_configFile);
|
||||||
m_db.MaxTreeDepth(m_config.MaxTreeDepth());
|
m_db.MaxTreeDepth(m_config.MaxTreeDepth());
|
||||||
m_db.Open(m_config.DatabasePath());
|
m_db.Open(m_config.DatabasePath());
|
||||||
m_filter.Init();
|
m_filter.Init();
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Application::ParseArgs(int argc, char* argv[])
|
void Application::ParseArgs(int argc, char* argv[])
|
||||||
|
|
|
@ -17,8 +17,10 @@
|
||||||
|
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <regex>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
#include "usenetsearch/ScopeExit.h"
|
||||||
#include "usenetsearch/StringUtils.h"
|
#include "usenetsearch/StringUtils.h"
|
||||||
|
|
||||||
#include "usenetsearch/Configuration.h"
|
#include "usenetsearch/Configuration.h"
|
||||||
|
@ -35,12 +37,22 @@ std::filesystem::path Configuration::DatabasePath() const
|
||||||
return m_databasePath;
|
return m_databasePath;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> Configuration::FilterEraseSubtoken() const
|
std::vector<std::string>& Configuration::FilterEraseSubtoken()
|
||||||
{
|
{
|
||||||
return m_filterEraseSubtoken;
|
return m_filterEraseSubtoken;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> Configuration::FilterWordsNoSubtoken() const
|
std::vector<std::regex>& Configuration::FilterNewsgroupBlacklist()
|
||||||
|
{
|
||||||
|
return m_filterNewsgroupBlacklist;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::regex>& Configuration::FilterNewsgroupWhitelist()
|
||||||
|
{
|
||||||
|
return m_filterNewsgroupWhitelist;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string>& Configuration::FilterWordsNoSubtoken()
|
||||||
{
|
{
|
||||||
return m_filterWordsNoSubtoken;
|
return m_filterWordsNoSubtoken;
|
||||||
}
|
}
|
||||||
|
@ -95,6 +107,7 @@ void Configuration::Open(const std::string& filename)
|
||||||
"Could not open configuration file: " + filename
|
"Could not open configuration file: " + filename
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
ScopeExit finCloser([&fin](){ fin.close(); });
|
||||||
int line_nr = 0;
|
int line_nr = 0;
|
||||||
while(std::getline(fin,line))
|
while(std::getline(fin,line))
|
||||||
{
|
{
|
||||||
|
@ -108,7 +121,6 @@ void Configuration::Open(const std::string& filename)
|
||||||
const auto kvp = StringSplit(line, std::string{":"}, 2);
|
const auto kvp = StringSplit(line, std::string{":"}, 2);
|
||||||
if (kvp.size() != 2)
|
if (kvp.size() != 2)
|
||||||
{
|
{
|
||||||
fin.close();
|
|
||||||
throw ConfigurationException(EINVAL,
|
throw ConfigurationException(EINVAL,
|
||||||
std::string("Invalid configuration in ")
|
std::string("Invalid configuration in ")
|
||||||
+ filename + std::string(" line ")
|
+ filename + std::string(" line ")
|
||||||
|
@ -137,6 +149,40 @@ void Configuration::Open(const std::string& filename)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
else if (key == "filter_newsgroup_blacklist")
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
std::regex re(value);
|
||||||
|
m_filterNewsgroupBlacklist.emplace_back(re);
|
||||||
|
}
|
||||||
|
catch (const std::regex_error& e)
|
||||||
|
{
|
||||||
|
throw ConfigurationException(EINVAL,
|
||||||
|
std::string("Invalid configuration in ")
|
||||||
|
+ filename + std::string(" line ")
|
||||||
|
+ std::to_string(line_nr) + " : Regular expression \""
|
||||||
|
+ value + "\" did not parse: " + e.what()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (key == "filter_newsgroup_whitelist")
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
std::regex re(value);
|
||||||
|
m_filterNewsgroupWhitelist.emplace_back(re);
|
||||||
|
}
|
||||||
|
catch (const std::regex_error& e)
|
||||||
|
{
|
||||||
|
throw ConfigurationException(EINVAL,
|
||||||
|
std::string("Invalid configuration in ")
|
||||||
|
+ filename + std::string(" line ")
|
||||||
|
+ std::to_string(line_nr) + " : Regular expression \""
|
||||||
|
+ value + "\" did not parse: " + e.what()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
else if (key == "filter_no_subtoken")
|
else if (key == "filter_no_subtoken")
|
||||||
{
|
{
|
||||||
const auto tokens = StringSplit(value, std::string{","});
|
const auto tokens = StringSplit(value, std::string{","});
|
||||||
|
@ -195,7 +241,6 @@ void Configuration::Open(const std::string& filename)
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
fin.close();
|
|
||||||
throw ConfigurationException(EINVAL,
|
throw ConfigurationException(EINVAL,
|
||||||
std::string("Invalid configuration in ")
|
std::string("Invalid configuration in ")
|
||||||
+ filename + std::string(" line ")
|
+ filename + std::string(" line ")
|
||||||
|
@ -203,7 +248,6 @@ void Configuration::Open(const std::string& filename)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fin.close();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace usenetsearch
|
} // namespace usenetsearch
|
||||||
|
|
187
src/Database.cpp
187
src/Database.cpp
|
@ -24,6 +24,7 @@
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "usenetsearch/Application.h"
|
||||||
#include "usenetsearch/StringUtils.h"
|
#include "usenetsearch/StringUtils.h"
|
||||||
#include "usenetsearch/UsenetClient.h"
|
#include "usenetsearch/UsenetClient.h"
|
||||||
#include "usenetsearch/ScopeExit.h"
|
#include "usenetsearch/ScopeExit.h"
|
||||||
|
@ -35,7 +36,7 @@ namespace usenetsearch {
|
||||||
|
|
||||||
// Database class --------------------------------------------------------------
|
// Database class --------------------------------------------------------------
|
||||||
|
|
||||||
Database::Database(Filter& filter): m_filter(filter)
|
Database::Database(Application& app): m_app(app)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -44,15 +45,41 @@ Database::~Database()
|
||||||
m_newsGroupFileIO.Close();
|
m_newsGroupFileIO.Close();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Database::GetArticleEntry(
|
std::unique_ptr<NntpListEntry> Database::FindNntpEntry(
|
||||||
const std::string& subToken,
|
const std::string& subject)
|
||||||
const std::string& searchString,
|
|
||||||
ArticleEntry& entry,
|
|
||||||
size_t& startPosition,
|
|
||||||
size_t& endPosition,
|
|
||||||
size_t& count)
|
|
||||||
{
|
{
|
||||||
return false;
|
OpenNewsGroupFile();
|
||||||
|
ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
|
||||||
|
const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64();
|
||||||
|
std::unique_ptr<NntpListEntry> result = nullptr;
|
||||||
|
for (std::uint64_t n = 0; n != numGroups; ++n)
|
||||||
|
{
|
||||||
|
NntpListEntry entry;
|
||||||
|
m_newsGroupFileIO >> entry;
|
||||||
|
if (entry.name == subject)
|
||||||
|
{
|
||||||
|
result = std::make_unique<NntpListEntry>(entry);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uint32_t Database::GetLastIndexedArticle(std::uint64_t newsgroupID)
|
||||||
|
{
|
||||||
|
OpenNewsGroupFile();
|
||||||
|
ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
|
||||||
|
const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64();
|
||||||
|
for (std::uint64_t n = 0; n != numGroups; ++n)
|
||||||
|
{
|
||||||
|
NntpListEntry entry;
|
||||||
|
m_newsGroupFileIO >> entry;
|
||||||
|
if (entry.id == newsgroupID)
|
||||||
|
{
|
||||||
|
return entry.lastIndexedArticle;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::filesystem::path Database::GetTokenFilePath(
|
std::filesystem::path Database::GetTokenFilePath(
|
||||||
|
@ -77,29 +104,29 @@ std::filesystem::path Database::GetTokenFilePath(
|
||||||
return groupPath / groupFile;
|
return groupPath / groupFile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::uint64_t Database::GetUniqueNntpEntryId(
|
||||||
|
const std::vector<NntpListEntry>& list) const
|
||||||
|
{
|
||||||
|
std::uint64_t result{0};
|
||||||
|
for (auto& entry: list)
|
||||||
|
{
|
||||||
|
if (result <= entry.id)
|
||||||
|
{
|
||||||
|
result = entry.id + 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
void Database::MaxTreeDepth(std::uint8_t depth)
|
void Database::MaxTreeDepth(std::uint8_t depth)
|
||||||
{
|
{
|
||||||
m_maxTreeDepth = depth;
|
m_maxTreeDepth = depth;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<std::vector<NntpHeader>> Database::LoadArticleList(
|
|
||||||
const std::wstring& newsgroup)
|
|
||||||
{
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
|
std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
|
||||||
{
|
{
|
||||||
OpenNewsGroupFile();
|
OpenNewsGroupFile();
|
||||||
const std::uint64_t dbVersion = m_newsGroupFileIO.ReadInt64();
|
ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
|
||||||
if (dbVersion != m_databaseVersion)
|
|
||||||
{
|
|
||||||
throw DatabaseException(EINVAL,
|
|
||||||
"The loaded database version (" + std::to_string(dbVersion)
|
|
||||||
+ ") does not match the current database version ("
|
|
||||||
+ std::to_string(m_databaseVersion) + ")");
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t newsGroupCount = m_newsGroupFileIO.ReadInt64();
|
const size_t newsGroupCount = m_newsGroupFileIO.ReadInt64();
|
||||||
|
|
||||||
auto result = std::make_unique<std::vector<NntpListEntry>>();
|
auto result = std::make_unique<std::vector<NntpListEntry>>();
|
||||||
|
@ -119,18 +146,37 @@ void Database::Open(std::filesystem::path dbPath)
|
||||||
{
|
{
|
||||||
std::filesystem::create_directory(dbPath);
|
std::filesystem::create_directory(dbPath);
|
||||||
}
|
}
|
||||||
OpenNewsGroupFile();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void Database::OpenNewsGroupFile()
|
void Database::OpenNewsGroupFile()
|
||||||
{
|
{
|
||||||
if (m_newsGroupFileIO.IsOpen())
|
if (m_newsGroupFileIO.IsOpen())
|
||||||
{
|
{
|
||||||
|
m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
const std::filesystem::path newsGroupFilePath =
|
const std::filesystem::path newsGroupFilePath =
|
||||||
m_databasePath / "newsgroups.db";
|
m_databasePath / "newsgroups.db";
|
||||||
|
const bool exists = std::filesystem::exists(newsGroupFilePath);
|
||||||
m_newsGroupFileIO.Open(newsGroupFilePath);
|
m_newsGroupFileIO.Open(newsGroupFilePath);
|
||||||
|
if (exists)
|
||||||
|
{
|
||||||
|
const std::uint64_t ver = m_newsGroupFileIO.ReadInt64();
|
||||||
|
if (ver != m_databaseVersion)
|
||||||
|
{
|
||||||
|
throw DatabaseException(EBADF,
|
||||||
|
std::string{"Mismatching newgroup file database version:"}
|
||||||
|
+ " have: " + std::to_string(ver) + " - want: "
|
||||||
|
+ std::to_string(m_databaseVersion)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_newsGroupFileIO << m_databaseVersion;
|
||||||
|
m_newsGroupFileIO << std::uint64_t{0}; // newsgroup count.
|
||||||
|
m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Database::ParseTokenFile(
|
void Database::ParseTokenFile(
|
||||||
|
@ -155,26 +201,30 @@ void Database::ParseTokenFile(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Database::UpdateArticleList(
|
void Database::SetLastIndexedArticle(
|
||||||
const std::wstring& newsgroup,
|
std::uint64_t newsgroupID,
|
||||||
const std::vector<NntpHeader>& headers)
|
std::int32_t articleID)
|
||||||
{
|
{
|
||||||
|
auto outItems = LoadNewsgroupList();
|
||||||
}
|
bool found{false};
|
||||||
|
if (outItems)
|
||||||
void Database::UpdateNewsgroupList(const std::vector<NntpListEntry>& list)
|
|
||||||
{
|
|
||||||
OpenNewsGroupFile();
|
|
||||||
|
|
||||||
m_newsGroupFileIO << m_databaseVersion;
|
|
||||||
|
|
||||||
const std::uint64_t newsGroupCount = list.size();
|
|
||||||
m_newsGroupFileIO << newsGroupCount;
|
|
||||||
|
|
||||||
for (const auto& entry: list)
|
|
||||||
{
|
{
|
||||||
m_newsGroupFileIO << entry;
|
for (auto& entry: *outItems)
|
||||||
|
{
|
||||||
|
if (entry.id == newsgroupID)
|
||||||
|
{
|
||||||
|
entry.lastIndexedArticle = articleID;
|
||||||
|
found = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
if (!found)
|
||||||
|
{
|
||||||
|
throw DatabaseException(EINVAL,
|
||||||
|
"Attempt to update newsgroup not found in database - id: "
|
||||||
|
+ std::to_string(newsgroupID));
|
||||||
|
}
|
||||||
|
UpdateNewsgroupList(*outItems);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Database::SaveSearchTokens(
|
void Database::SaveSearchTokens(
|
||||||
|
@ -188,7 +238,10 @@ void Database::SaveSearchTokens(
|
||||||
" ",
|
" ",
|
||||||
m_maxTreeDepth,
|
m_maxTreeDepth,
|
||||||
[&](const std::string& subToken, const std::string& str){
|
[&](const std::string& subToken, const std::string& str){
|
||||||
const std::string tok = m_filter.ProcessToken(subToken, str);
|
const std::string tok = m_app.GetFilter().ProcessToken(
|
||||||
|
subToken,
|
||||||
|
str
|
||||||
|
);
|
||||||
if (tok.empty()) return;
|
if (tok.empty()) return;
|
||||||
SaveToken(tok, newsgroupID, articleID);
|
SaveToken(tok, newsgroupID, articleID);
|
||||||
}
|
}
|
||||||
|
@ -311,4 +364,52 @@ std::unique_ptr<std::vector<ArticleEntry>> Database::Search(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Database::UpdateNewsgroupList(std::vector<NntpListEntry>& list)
|
||||||
|
{
|
||||||
|
if (list.size() == 0) return;
|
||||||
|
|
||||||
|
auto outList = LoadNewsgroupList();
|
||||||
|
for (auto& entry: list)
|
||||||
|
{
|
||||||
|
NntpListEntry newEntry(entry);
|
||||||
|
bool found{false};
|
||||||
|
if (outList)
|
||||||
|
{
|
||||||
|
std::for_each(
|
||||||
|
outList->begin(),
|
||||||
|
outList->end(),
|
||||||
|
[&entry, &found](NntpListEntry& oldEntry)
|
||||||
|
{
|
||||||
|
if (oldEntry.name == entry.name)
|
||||||
|
{
|
||||||
|
// update existing (copy everything but ID & name)
|
||||||
|
found = true;
|
||||||
|
oldEntry.count = entry.count;
|
||||||
|
oldEntry.high = entry.high;
|
||||||
|
oldEntry.lastIndexedArticle = entry.lastIndexedArticle;
|
||||||
|
oldEntry.low = entry.low;
|
||||||
|
oldEntry.status = entry.status;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (found) continue;
|
||||||
|
// add new.
|
||||||
|
newEntry.id = GetUniqueNntpEntryId(*outList);
|
||||||
|
outList->emplace_back(newEntry);
|
||||||
|
entry.id = newEntry.id;
|
||||||
|
}
|
||||||
|
OpenNewsGroupFile();
|
||||||
|
ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
|
||||||
|
m_newsGroupFileIO << std::uint64_t{outList->size()};
|
||||||
|
std::for_each(
|
||||||
|
outList->begin(),
|
||||||
|
outList->end(),
|
||||||
|
[&](const NntpListEntry& e)
|
||||||
|
{
|
||||||
|
m_newsGroupFileIO << e;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace usenetsearch
|
} // namespace usenetsearch
|
||||||
|
|
|
@ -36,7 +36,7 @@ void Filter::Init()
|
||||||
// Pre-compile regexes for all the subtokens that should be erased.
|
// Pre-compile regexes for all the subtokens that should be erased.
|
||||||
std::for_each(eraseTokens.begin(), eraseTokens.end(),
|
std::for_each(eraseTokens.begin(), eraseTokens.end(),
|
||||||
[&](const std::string& tok){
|
[&](const std::string& tok){
|
||||||
const std::wstring wtok = m_conv.from_bytes(tok);
|
const std::wstring wtok = WideStringFromString(tok);
|
||||||
m_eraseTokenRegexes.emplace(
|
m_eraseTokenRegexes.emplace(
|
||||||
std::make_unique<std::wregex>(L"^" + wtok + L"\\s+"),
|
std::make_unique<std::wregex>(L"^" + wtok + L"\\s+"),
|
||||||
std::wstring{}
|
std::wstring{}
|
||||||
|
@ -53,17 +53,34 @@ void Filter::Init()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string Filter::ProcessSearchString(const std::string& searchString)
|
bool Filter::ProcessNewsgroup(const std::string& newsgroup) const
|
||||||
{
|
{
|
||||||
std::wstring str;
|
for (const auto& blackRe: m_config.FilterNewsgroupBlacklist())
|
||||||
try
|
|
||||||
{
|
{
|
||||||
str = m_conv.from_bytes(searchString);
|
std::smatch matches;
|
||||||
|
if (std::regex_match(newsgroup, matches, blackRe))
|
||||||
|
{
|
||||||
|
if (matches.size() > 0) return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
catch (const std::range_error&)
|
if (m_config.FilterNewsgroupWhitelist().size() > 0)
|
||||||
{
|
{
|
||||||
return ""; // string is not valid utf8
|
for (const auto& whiteRe: m_config.FilterNewsgroupWhitelist())
|
||||||
|
{
|
||||||
|
std::smatch matches;
|
||||||
|
if (std::regex_match(newsgroup, matches, whiteRe))
|
||||||
|
{
|
||||||
|
if (matches.size() > 0) return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string Filter::ProcessSearchString(const std::string& searchString) const
|
||||||
|
{
|
||||||
|
std::wstring str = WideStringFromString(searchString);
|
||||||
std::remove_if(str.begin(), str.end(), [](wchar_t c){
|
std::remove_if(str.begin(), str.end(), [](wchar_t c){
|
||||||
// Remove control characters.
|
// Remove control characters.
|
||||||
if (c < 0x20) return true; // ascii control chars
|
if (c < 0x20) return true; // ascii control chars
|
||||||
|
@ -89,21 +106,12 @@ std::string Filter::ProcessSearchString(const std::string& searchString)
|
||||||
// Convert strings that are ONLY whitespace to blank strings.
|
// Convert strings that are ONLY whitespace to blank strings.
|
||||||
static std::wregex rxAllWhitespace(L"^\\s+$");
|
static std::wregex rxAllWhitespace(L"^\\s+$");
|
||||||
str = std::regex_replace(str, rxAllWhitespace, L"");
|
str = std::regex_replace(str, rxAllWhitespace, L"");
|
||||||
std::string result;
|
return StringFromWideString(str);
|
||||||
try
|
|
||||||
{
|
|
||||||
result = m_conv.to_bytes(str);
|
|
||||||
}
|
|
||||||
catch (const std::range_error&)
|
|
||||||
{
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string Filter::ProcessToken(
|
std::string Filter::ProcessToken(
|
||||||
const std::string& token,
|
const std::string& token,
|
||||||
const std::string& searchString)
|
const std::string& searchString) const
|
||||||
{
|
{
|
||||||
std::string result = token;
|
std::string result = token;
|
||||||
// Process the nosubtokens list.
|
// Process the nosubtokens list.
|
||||||
|
|
152
src/Indexer.cpp
152
src/Indexer.cpp
|
@ -23,6 +23,92 @@
|
||||||
|
|
||||||
namespace usenetsearch {
|
namespace usenetsearch {
|
||||||
|
|
||||||
|
// SearchResult class ----------------------------------------------------------
|
||||||
|
|
||||||
|
SearchResult::SearchResult(const ArticleEntry& entry)
|
||||||
|
{
|
||||||
|
m_newsgroupId = entry.newsgroupID;
|
||||||
|
m_articleId = entry.articleID;
|
||||||
|
}
|
||||||
|
|
||||||
|
SearchResult::SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId):
|
||||||
|
m_newsgroupId(newsgroupId), m_articleId(articleId)
|
||||||
|
{
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
SearchResult::SearchResult(const SearchResult& other)
|
||||||
|
{
|
||||||
|
m_articleId = other.m_articleId;
|
||||||
|
m_newsgroupId = other.m_newsgroupId;
|
||||||
|
m_numHits = other.m_numHits;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::uint32_t SearchResult::ArticleId() const
|
||||||
|
{
|
||||||
|
return m_articleId;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t SearchResult::Hits() const
|
||||||
|
{
|
||||||
|
return m_numHits;
|
||||||
|
}
|
||||||
|
|
||||||
|
void SearchResult::Inc()
|
||||||
|
{
|
||||||
|
m_numHits++;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::uint32_t SearchResult::NewsgroupId() const
|
||||||
|
{
|
||||||
|
return m_newsgroupId;
|
||||||
|
}
|
||||||
|
|
||||||
|
void SearchResult::operator=(const SearchResult& other)
|
||||||
|
{
|
||||||
|
m_articleId = other.m_articleId;
|
||||||
|
m_newsgroupId = other.m_newsgroupId;
|
||||||
|
m_numHits = other.m_numHits;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SearchResult::operator==(const SearchResult& other) const
|
||||||
|
{
|
||||||
|
const bool result =
|
||||||
|
(m_articleId == other.m_articleId)
|
||||||
|
&& (m_newsgroupId == other.m_newsgroupId);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SearchResult::operator!=(const SearchResult& other) const
|
||||||
|
{
|
||||||
|
return (
|
||||||
|
(m_articleId != other.m_articleId)
|
||||||
|
|| (m_newsgroupId != other.m_newsgroupId)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SearchResult::operator<(const SearchResult& other) const
|
||||||
|
{
|
||||||
|
return m_numHits < other.m_numHits;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SearchResult::operator>(const SearchResult& other) const
|
||||||
|
{
|
||||||
|
return m_numHits > other.m_numHits;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SearchResult::operator>=(const SearchResult& other) const
|
||||||
|
{
|
||||||
|
return m_numHits >= other.m_numHits;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool SearchResult::operator<=(const SearchResult& other) const
|
||||||
|
{
|
||||||
|
return m_numHits <= other.m_numHits;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Indexer class ---------------------------------------------------------------
|
||||||
|
|
||||||
Indexer::Indexer(Application& app, UsenetClient& client)
|
Indexer::Indexer(Application& app, UsenetClient& client)
|
||||||
: m_app(app), m_client(client)
|
: m_app(app), m_client(client)
|
||||||
{
|
{
|
||||||
|
@ -59,20 +145,41 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
|
||||||
<< "(.=" << batchSize << " headers)." << std::endl;
|
<< "(.=" << batchSize << " headers)." << std::endl;
|
||||||
std::cout.flush();
|
std::cout.flush();
|
||||||
std::atomic<std::uint64_t> headerCount{0};
|
std::atomic<std::uint64_t> headerCount{0};
|
||||||
|
const std::atomic<std::uint64_t> groupID = group.id;
|
||||||
std::reference_wrapper<Database> dbref = std::ref(m_app.GetDb());
|
std::reference_wrapper<Database> dbref = std::ref(m_app.GetDb());
|
||||||
m_client.ProcessHeaders(0,
|
const std::uint32_t startMessage = dbref.get().GetLastIndexedArticle(
|
||||||
[this, &headerCount, &dbref](std::shared_ptr<NntpHeaders> headers){
|
groupID
|
||||||
m_threads.Queue([this, headers, &headerCount, &dbref](){
|
);
|
||||||
|
std::cout << "Indexing starting at message: "
|
||||||
|
<< std::to_string(startMessage) << std::endl;
|
||||||
|
m_client.ProcessHeaders(startMessage,
|
||||||
|
[this, &headerCount, &dbref, &groupID](std::shared_ptr<NntpHeaders> headers){
|
||||||
|
m_threads.Queue([this, headers, &headerCount, &dbref, &groupID](){
|
||||||
|
std::uint64_t lastArticle{0};
|
||||||
for (const auto& header: *headers)
|
for (const auto& header: *headers)
|
||||||
{
|
{
|
||||||
const std::uint64_t id{header.articleID};
|
const std::uint64_t articleID{header.articleID};
|
||||||
std::string subject = header.subject;
|
std::string subject = header.subject;
|
||||||
subject = m_app.GetFilter().ProcessSearchString(
|
subject = m_app.GetFilter().ProcessSearchString(
|
||||||
subject
|
subject
|
||||||
);
|
);
|
||||||
if (subject == "") continue;
|
if (subject == "") continue;
|
||||||
dbref.get().SaveSearchTokens(1, id, subject);
|
dbref.get().SaveSearchTokens(
|
||||||
|
groupID,
|
||||||
|
articleID,
|
||||||
|
subject
|
||||||
|
);
|
||||||
headerCount++;
|
headerCount++;
|
||||||
|
if (articleID > lastArticle) lastArticle = articleID;
|
||||||
|
}
|
||||||
|
// Update last-indexed id for the newsgroup.
|
||||||
|
const std::uint32_t lastIndexedID =
|
||||||
|
dbref.get().GetLastIndexedArticle(groupID);
|
||||||
|
if (lastIndexedID < lastArticle)
|
||||||
|
{
|
||||||
|
dbref.get().SetLastIndexedArticle(
|
||||||
|
groupID, lastArticle
|
||||||
|
);
|
||||||
}
|
}
|
||||||
std::cout << ".";
|
std::cout << ".";
|
||||||
std::cout.flush();
|
std::cout.flush();
|
||||||
|
@ -87,4 +194,39 @@ void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<SearchResults> Indexer::Search(
|
||||||
|
const std::string& searchString)
|
||||||
|
{
|
||||||
|
auto result = std::make_unique<SearchResults>();
|
||||||
|
const std::string sstr = m_app.GetFilter().ProcessSearchString(
|
||||||
|
searchString
|
||||||
|
);
|
||||||
|
auto searchResults = m_app.GetDb().Search(sstr);
|
||||||
|
if (!searchResults) return result;
|
||||||
|
for(const ArticleEntry& entry: *searchResults)
|
||||||
|
{
|
||||||
|
SearchResult sr(entry);
|
||||||
|
// Check if a matching entry already exists in the result set, if so,
|
||||||
|
// increment count. Otherwise, append a new entry.
|
||||||
|
auto it = std::find(result->begin(), result->end(), sr);
|
||||||
|
if (it != result->end())
|
||||||
|
{
|
||||||
|
(*it).Inc();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
sr.Inc();
|
||||||
|
result->emplace_back(sr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::sort(
|
||||||
|
result->begin(),
|
||||||
|
result->end(),
|
||||||
|
[](const SearchResult& a, const SearchResult& b){
|
||||||
|
return a.Hits() > b.Hits();
|
||||||
|
}
|
||||||
|
);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace usenetsearch
|
} // namespace usenetsearch
|
||||||
|
|
|
@ -170,7 +170,7 @@ std::string SerializableFile::ReadStr(size_t size) const
|
||||||
RangeUnlock(startPos, size);
|
RangeUnlock(startPos, size);
|
||||||
});
|
});
|
||||||
size_t bytesRead{0};
|
size_t bytesRead{0};
|
||||||
std::string result(size + 1, '\0');
|
std::string result(size, '\0');
|
||||||
while (bytesRead < size)
|
while (bytesRead < size)
|
||||||
{
|
{
|
||||||
const auto readNow = read(m_fd, &result[0], size);
|
const auto readNow = read(m_fd, &result[0], size);
|
||||||
|
@ -485,21 +485,49 @@ SerializableFile& operator>>(SerializableFile& in, NntpHeader& obj)
|
||||||
|
|
||||||
// Writes one newsgroup list entry.
//
// Layout: two framing bytes (1, 2), then id, lastIndexedArticle, count, high,
// low, name and status in that order, then two closing framing bytes (3, 4).
// The matching operator>> reads the fields in the same order and validates
// the framing bytes, so the order here must not change.
SerializableFile& operator<<(SerializableFile& out, const NntpListEntry& obj)
{
    // Header framing.
    out.Write(std::uint8_t{1}); // start of heading
    out.Write(std::uint8_t{2}); // start of text

    // Payload.
    out << obj.id;
    out << obj.lastIndexedArticle;
    out << obj.count;
    out << obj.high;
    out << obj.low;
    out << obj.name;
    out << obj.status;

    // Footer framing.
    out.Write(std::uint8_t{3}); // end of text
    out.Write(std::uint8_t{4}); // end of transmission
    return out;
}
|
||||||
|
|
||||||
// Reads one newsgroup list entry written by the matching operator<<.
//
// Expects framing bytes 1, 2 before the fields and 3, 4 after them. Throws
// SerializeException(EBADMSG) when either pair does not match.
SerializableFile& operator>>(SerializableFile& in, NntpListEntry& obj)
{
    // Header framing.
    std::uint8_t startOfHeading{};
    std::uint8_t startOfText{};
    in >> startOfHeading;
    in >> startOfText;
    const bool headerOk = (startOfHeading == 1) && (startOfText == 2);
    if (!headerOk)
    {
        throw SerializeException(EBADMSG,
            "Bad magic number in NNTP entry header."
        );
    }

    // Payload, in the order it was written.
    in >> obj.id;
    in >> obj.lastIndexedArticle;
    in >> obj.count;
    in >> obj.high;
    in >> obj.low;
    in >> obj.name;
    in >> obj.status;

    // Footer framing.
    std::uint8_t endOfText{};
    std::uint8_t endOfTransmission{};
    in >> endOfText;
    in >> endOfTransmission;
    const bool footerOk = (endOfText == 3) && (endOfTransmission == 4);
    if (!footerOk)
    {
        throw SerializeException(EBADMSG,
            "Bad magic number in NNTP entry footer."
        );
    }
    return in;
}
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,31 @@
|
||||||
|
|
||||||
namespace usenetsearch {
|
namespace usenetsearch {
|
||||||
|
|
||||||
|
static std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
|
||||||
|
|
||||||
|
// Formats a single byte as exactly two lowercase hexadecimal digits.
//
// The byte is converted through unsigned char first. Where plain char is
// signed, converting a byte >= 0x80 straight to int sign-extends it, and the
// stream then prints eight digits ("ffffff80") instead of two ("80").
//
// NOTE(review): strings produced for bytes >= 0x80 change with this fix on
// signed-char platforms; confirm nothing persisted depends on the old
// eight-digit form.
std::string CharToHex(const char c)
{
    const unsigned int val = static_cast<unsigned char>(c);
    std::ostringstream result;
    result << std::setw(2) << std::setfill('0') << std::hex << val;
    return result.str();
}
|
||||||
|
|
||||||
|
// Converts a wide string to a UTF-8 encoded std::string.
//
// Returns an empty string when the converter reports a std::range_error.
//
// A per-thread converter is used because std::wstring_convert keeps
// conversion state between calls, and this function is reached from indexer
// worker threads via Filter::ProcessSearchString. One shared instance would
// be mutated concurrently.
std::string StringFromWideString(const std::wstring& input)
{
    thread_local std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>>
        converter;
    try
    {
        return converter.to_bytes(input);
    }
    catch (const std::range_error&)
    {
        return "";
    }
}
|
||||||
|
|
||||||
std::string StringHash(const std::string& input)
|
std::string StringHash(const std::string& input)
|
||||||
{
|
{
|
||||||
unsigned char result[MD5_DIGEST_LENGTH];
|
unsigned char result[MD5_DIGEST_LENGTH];
|
||||||
|
@ -123,13 +148,18 @@ void StringTreeOperation(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string CharToHex(const char c)
|
std::wstring WideStringFromString(const std::string& input)
|
||||||
{
|
{
|
||||||
const int val = c;
|
std::wstring result;
|
||||||
std::ostringstream result;
|
try
|
||||||
result << std::setw(2) << std::setfill('0') << std::hex;
|
{
|
||||||
result << val;
|
result = conv.from_bytes(input);
|
||||||
return result.str();
|
}
|
||||||
|
catch(const std::range_error&)
|
||||||
|
{
|
||||||
|
return L"";
|
||||||
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace usenetsearch
|
} // namespace usenetsearch
|
||||||
|
|
|
@ -22,6 +22,7 @@
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
#include "usenetsearch/Application.h"
|
||||||
#include "usenetsearch/Except.h"
|
#include "usenetsearch/Except.h"
|
||||||
#include "usenetsearch/StringUtils.h"
|
#include "usenetsearch/StringUtils.h"
|
||||||
|
|
||||||
|
@ -31,6 +32,10 @@ namespace usenetsearch {
|
||||||
|
|
||||||
// UsenetClient class ----------------------------------------------------------
|
// UsenetClient class ----------------------------------------------------------
|
||||||
|
|
||||||
|
// Creates a client bound to the given application. The application is held
// by reference and is used later to reach the newsgroup filter.
UsenetClient::UsenetClient(Application& app)
    : m_app(app)
{
}
|
||||||
|
|
||||||
void UsenetClient::Authenticate(
|
void UsenetClient::Authenticate(
|
||||||
const std::wstring& user,
|
const std::wstring& user,
|
||||||
const std::wstring& password)
|
const std::wstring& password)
|
||||||
|
@ -105,7 +110,7 @@ void UsenetClient::Group(const std::wstring& newsgroup)
|
||||||
{
|
{
|
||||||
throw UsenetClientException(
|
throw UsenetClientException(
|
||||||
response.code,
|
response.code,
|
||||||
"Error changing group to " + m_conv.to_bytes(newsgroup) + " : "
|
"Error changing group to " + StringFromWideString(newsgroup) + " : "
|
||||||
+ response.message
|
+ response.message
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -145,6 +150,86 @@ NntpHeader UsenetClient::Head(std::uint64_t articleID)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool UsenetClient::IsError(const NntpMessage& msg) const
|
||||||
|
{
|
||||||
|
if (msg.code >= 400) return true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<std::vector<NntpListEntry>> UsenetClient::List()
|
||||||
|
{
|
||||||
|
Write(L"LIST COUNTS\r\n");
|
||||||
|
/* In response, we should get a 215 response followed by the list of news
|
||||||
|
groups ending in a period on it's own line. */
|
||||||
|
const auto response = ReadLine();
|
||||||
|
if (IsError(response))
|
||||||
|
{
|
||||||
|
throw UsenetClientException(
|
||||||
|
response.code,
|
||||||
|
"Failed to fetch newsgroup list from server, "
|
||||||
|
+ std::string{"server responded with: "}
|
||||||
|
+ response.message
|
||||||
|
);
|
||||||
|
}
|
||||||
|
const auto listStr = ReadUntil("\r\n.\r\n");
|
||||||
|
// parse the list.
|
||||||
|
auto lines = StringSplit(listStr, std::string{"\r\n"});
|
||||||
|
auto result = std::make_unique<std::vector<NntpListEntry>>();
|
||||||
|
if (lines.empty()) return result;
|
||||||
|
for (const auto& line: lines)
|
||||||
|
{
|
||||||
|
NntpListEntry entry;
|
||||||
|
const auto fields = StringSplit(line, std::string{" "});
|
||||||
|
if (fields.size() == 5)
|
||||||
|
{
|
||||||
|
entry.name = fields[0];
|
||||||
|
entry.high = std::stoul(fields[1]);
|
||||||
|
entry.low = std::stoul(fields[2]);
|
||||||
|
entry.count = std::stoul(fields[3]);
|
||||||
|
entry.status = fields[4];
|
||||||
|
entry.id = 0; // incremented by db when saving.
|
||||||
|
entry.lastIndexedArticle = 0;
|
||||||
|
if (m_app.GetFilter().ProcessNewsgroup(entry.name))
|
||||||
|
{
|
||||||
|
result->emplace_back(entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::unique_ptr<std::vector<std::uint64_t>> UsenetClient::ListGroup(
|
||||||
|
const std::wstring& newsGroup)
|
||||||
|
{
|
||||||
|
auto result = std::make_unique<std::vector<std::uint64_t>>();
|
||||||
|
if (!m_app.GetFilter().ProcessNewsgroup(StringFromWideString(newsGroup)))
|
||||||
|
{
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
Write(L"LISTGROUP " + newsGroup + L"\r\n");
|
||||||
|
/* In response, we should get a 211 response followed by the list of
|
||||||
|
article ID's ending in a period on it's own line. */
|
||||||
|
const auto response = ReadLine();
|
||||||
|
if (IsError(response))
|
||||||
|
{
|
||||||
|
throw UsenetClientException(
|
||||||
|
response.code,
|
||||||
|
"Failed to fetch newsgroup list from server, "
|
||||||
|
+ std::string{"server responded with: "}
|
||||||
|
+ response.message
|
||||||
|
);
|
||||||
|
}
|
||||||
|
const auto listStr = ReadUntil("\r\n.\r\n");
|
||||||
|
// parse the list.
|
||||||
|
auto lines = StringSplit(listStr, std::string{"\r\n"});
|
||||||
|
if (lines.empty()) return result;
|
||||||
|
for (const auto& line: lines)
|
||||||
|
{
|
||||||
|
result->emplace_back(stoul(StringTrim(line)));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
void UsenetClient::ProcessHeaders(
|
void UsenetClient::ProcessHeaders(
|
||||||
std::uint64_t startMessage,
|
std::uint64_t startMessage,
|
||||||
std::function<void(std::shared_ptr<NntpHeaders>)> processFn,
|
std::function<void(std::shared_ptr<NntpHeaders>)> processFn,
|
||||||
|
@ -201,77 +286,6 @@ void UsenetClient::ProcessHeaders(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UsenetClient::IsError(const NntpMessage& msg) const
|
|
||||||
{
|
|
||||||
if (msg.code >= 400) return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_ptr<std::vector<NntpListEntry>> UsenetClient::List()
|
|
||||||
{
|
|
||||||
Write(L"LIST COUNTS\r\n");
|
|
||||||
/* In response, we should get a 215 response followed by the list of news
|
|
||||||
groups ending in a period on it's own line. */
|
|
||||||
const auto response = ReadLine();
|
|
||||||
if (IsError(response))
|
|
||||||
{
|
|
||||||
throw UsenetClientException(
|
|
||||||
response.code,
|
|
||||||
"Failed to fetch newsgroup list from server, "
|
|
||||||
+ std::string{"server responded with: "}
|
|
||||||
+ response.message
|
|
||||||
);
|
|
||||||
}
|
|
||||||
const auto listStr = ReadUntil("\r\n.\r\n");
|
|
||||||
// parse the list.
|
|
||||||
auto lines = StringSplit(listStr, std::string{"\r\n"});
|
|
||||||
auto result = std::make_unique<std::vector<NntpListEntry>>();
|
|
||||||
if (lines.empty()) return result;
|
|
||||||
for (const auto& line: lines)
|
|
||||||
{
|
|
||||||
NntpListEntry entry;
|
|
||||||
const auto fields = StringSplit(line, std::string{" "});
|
|
||||||
if (fields.size() == 5)
|
|
||||||
{
|
|
||||||
entry.name = fields[0];
|
|
||||||
entry.high = std::stoul(fields[1]);
|
|
||||||
entry.low = std::stoul(fields[2]);
|
|
||||||
entry.count = std::stoul(fields[3]);
|
|
||||||
entry.status = fields[4];
|
|
||||||
result->emplace_back(entry);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unique_ptr<std::vector<std::uint64_t>> UsenetClient::ListGroup(
|
|
||||||
const std::wstring& newsGroup)
|
|
||||||
{
|
|
||||||
Write(L"LISTGROUP " + newsGroup + L"\r\n");
|
|
||||||
/* In response, we should get a 211 response followed by the list of
|
|
||||||
article ID's ending in a period on it's own line. */
|
|
||||||
const auto response = ReadLine();
|
|
||||||
if (IsError(response))
|
|
||||||
{
|
|
||||||
throw UsenetClientException(
|
|
||||||
response.code,
|
|
||||||
"Failed to fetch newsgroup list from server, "
|
|
||||||
+ std::string{"server responded with: "}
|
|
||||||
+ response.message
|
|
||||||
);
|
|
||||||
}
|
|
||||||
const auto listStr = ReadUntil("\r\n.\r\n");
|
|
||||||
// parse the list.
|
|
||||||
auto lines = StringSplit(listStr, std::string{"\r\n"});
|
|
||||||
auto result = std::make_unique<std::vector<std::uint64_t>>();
|
|
||||||
if (lines.empty()) return result;
|
|
||||||
for (const auto& line: lines)
|
|
||||||
{
|
|
||||||
result->emplace_back(stoul(StringTrim(line)));
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
NntpMessage UsenetClient::ReadLine()
|
NntpMessage UsenetClient::ReadLine()
|
||||||
{
|
{
|
||||||
NntpMessage result{};
|
NntpMessage result{};
|
||||||
|
@ -306,7 +320,7 @@ std::string UsenetClient::ReadUntil(const std::string& deliminator)
|
||||||
|
|
||||||
void UsenetClient::Write(const std::wstring& message)
|
void UsenetClient::Write(const std::wstring& message)
|
||||||
{
|
{
|
||||||
const std::string toSend = m_conv.to_bytes(message);
|
const std::string toSend = StringFromWideString(message);
|
||||||
if (m_useSSL)
|
if (m_useSSL)
|
||||||
{
|
{
|
||||||
m_ssl->Write(toSend);
|
m_ssl->Write(toSend);
|
||||||
|
|
|
@ -0,0 +1,82 @@
|
||||||
|
/*
|
||||||
|
Copyright© 2021 John Sennesael
|
||||||
|
|
||||||
|
UsenetSearch is Free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
UsenetSearch is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <iomanip>
#include <iostream>
#include <string>

#include "usenetsearch/Application.h"
#include "usenetsearch/StringUtils.h"
|
||||||
|
|
||||||
|
using namespace usenetsearch;
|
||||||
|
|
||||||
|
int main(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
Application app;
|
||||||
|
std::string tokenFile{""};
|
||||||
|
std::string newsgroupFile{""};
|
||||||
|
app.AddFileOption(
|
||||||
|
't',
|
||||||
|
"token db file to dump.",
|
||||||
|
[&tokenFile](const std::string& val)
|
||||||
|
{
|
||||||
|
tokenFile = val;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
app.AddFileOption(
|
||||||
|
'n',
|
||||||
|
"newsgroup file to dump.",
|
||||||
|
[&newsgroupFile](const std::string& val)
|
||||||
|
{
|
||||||
|
newsgroupFile = val;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
if (!app.Init(argc, argv)) return 1;
|
||||||
|
if (!tokenFile.empty())
|
||||||
|
{
|
||||||
|
app.GetDb().ParseTokenFile(tokenFile, [](const ArticleEntry& token){
|
||||||
|
std::cout << "Hash: " << HashBytesToString(token.hash) << " | "
|
||||||
|
<< "NewsgroupID: " << token.newsgroupID << " | "
|
||||||
|
<< "ArticleID: " << token.articleID << std::endl;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (!newsgroupFile.empty())
|
||||||
|
{
|
||||||
|
const auto groups = app.GetDb().LoadNewsgroupList();
|
||||||
|
for(const auto& group: *groups)
|
||||||
|
{
|
||||||
|
std::cout << std::left
|
||||||
|
<< std::setw(9) << "Id: " + std::to_string(group.id)
|
||||||
|
<< std::setw(3) << " | "
|
||||||
|
<< std::setw(27) << "LastIndexedMsgId: "
|
||||||
|
+ std::to_string(group.lastIndexedArticle)
|
||||||
|
<< std::setw(3) << " | "
|
||||||
|
<< std::setw(14) << "Count: " +
|
||||||
|
std::to_string(group.count)
|
||||||
|
<< std::setw(3) << " | "
|
||||||
|
<< std::setw(13) << "High: " + std::to_string(group.high)
|
||||||
|
<< std::setw(3) << " | "
|
||||||
|
<< std::setw(8) << "Low: " + std::to_string(group.low)
|
||||||
|
<< std::setw(3) << " | "
|
||||||
|
<< std::setw(9) << "Status: " + group.status
|
||||||
|
<< std::setw(3) << " | "
|
||||||
|
<< std::setw(group.name.size() + 5)
|
||||||
|
<< "Name: " + group.name
|
||||||
|
<< std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -1,45 +0,0 @@
|
||||||
/*
|
|
||||||
Copyright© 2021 John Sennesael
|
|
||||||
|
|
||||||
UsenetSearch is Free software: you can redistribute it and/or modify
|
|
||||||
it under the terms of the GNU General Public License as published by
|
|
||||||
the Free Software Foundation, either version 3 of the License, or
|
|
||||||
(at your option) any later version.
|
|
||||||
|
|
||||||
UsenetSearch is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License
|
|
||||||
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <iostream>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "usenetsearch/Application.h"
|
|
||||||
#include "usenetsearch/StringUtils.h"
|
|
||||||
|
|
||||||
using namespace usenetsearch;
|
|
||||||
|
|
||||||
int main(int argc, char* argv[])
|
|
||||||
{
|
|
||||||
Application app;
|
|
||||||
std::string dbFile{""};
|
|
||||||
app.AddFileOption(
|
|
||||||
'd',
|
|
||||||
"token db file to dump.",
|
|
||||||
[&dbFile](const std::string& val)
|
|
||||||
{
|
|
||||||
dbFile = val;
|
|
||||||
}
|
|
||||||
);
|
|
||||||
app.Init(argc, argv);
|
|
||||||
app.GetDb().ParseTokenFile(dbFile, [](const ArticleEntry& token){
|
|
||||||
std::cout << "Hash: " << HashBytesToString(token.hash) << " | "
|
|
||||||
<< "NewsgroupID: " << token.newsgroupID << " | "
|
|
||||||
<< "ArticleID: " << token.articleID << std::endl;
|
|
||||||
});
|
|
||||||
return 0;
|
|
||||||
}
|
|
|
@ -1,6 +1,8 @@
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
#include "usenetsearch/Application.h"
|
#include "usenetsearch/Application.h"
|
||||||
|
#include "usenetsearch/UsenetClient.h"
|
||||||
|
#include "usenetsearch/Indexer.h"
|
||||||
|
|
||||||
using namespace usenetsearch;
|
using namespace usenetsearch;
|
||||||
|
|
||||||
|
@ -13,15 +15,41 @@ int main(int argc, char* argv[])
|
||||||
searchString = s;
|
searchString = s;
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
app.Init(argc, argv);
|
int maxResults{0};
|
||||||
|
app.AddIntegerOption('n', "Maximum results",
|
||||||
|
[&maxResults](int n){
|
||||||
|
maxResults = n;
|
||||||
|
}
|
||||||
|
);
|
||||||
|
if (!app.Init(argc, argv)) return 1;
|
||||||
if (searchString.empty())
|
if (searchString.empty())
|
||||||
{
|
{
|
||||||
std::cerr << "Missing search string." << std::endl;
|
std::cerr << "Missing search string." << std::endl;
|
||||||
app.Usage(argv[0]);
|
app.Usage(argv[0]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
searchString = app.GetFilter().ProcessSearchString(searchString);
|
UsenetClient client(app);
|
||||||
|
Indexer idx(app, client);
|
||||||
auto searchResults = app.GetDb().Search(searchString);
|
std::unique_ptr<SearchResults> results = idx.Search(
|
||||||
|
searchString
|
||||||
|
);
|
||||||
|
if (!results)
|
||||||
|
{
|
||||||
|
std::cout << "Nothing found." << std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
size_t resultCounter{0};
|
||||||
|
for (const auto& sr: *results)
|
||||||
|
{
|
||||||
|
std::cout << std::left
|
||||||
|
<< std::setw(18) << "Newsgroup id: " + std::to_string(sr.NewsgroupId())
|
||||||
|
<< std::setw(4) << " | "
|
||||||
|
<< std::setw(17) << "Article id: " + std::to_string(sr.ArticleId())
|
||||||
|
<< std::setw(4) << " | "
|
||||||
|
<< std::setw(10) << "Hits: " + std::to_string(sr.Hits())
|
||||||
|
<< std::endl;
|
||||||
|
resultCounter++;
|
||||||
|
if ((maxResults > 0) && (resultCounter >= maxResults)) break;
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,53 +27,24 @@ using namespace usenetsearch;
|
||||||
int main(int argc, char* argv[])
|
int main(int argc, char* argv[])
|
||||||
{
|
{
|
||||||
Application app;
|
Application app;
|
||||||
app.Init(argc, argv);
|
if (!app.Init(argc, argv)) return 1;
|
||||||
|
|
||||||
UsenetClient client;
|
|
||||||
|
|
||||||
|
UsenetClient client(app);
|
||||||
Indexer indexer(app, client);
|
Indexer indexer(app, client);
|
||||||
|
std::cout << "Connecting to newsgroup server...";
|
||||||
indexer.Connect();
|
indexer.Connect();
|
||||||
|
std::cout << "<OK>" << std::endl;
|
||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
// BEGIN TEMPORARY TEST CODE
|
std::cout << "Getting newsgroup list...";
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
|
std::cout.flush();
|
||||||
std::unique_ptr<std::vector<NntpListEntry>> list;
|
auto list = client.List();
|
||||||
NntpListEntry e{};
|
app.GetDb().UpdateNewsgroupList(*list);
|
||||||
e.count = 100;
|
std::cout << "<DONE>" << std::endl;
|
||||||
|
std::cout.flush();
|
||||||
// 1001 headers
|
std::cout << "Found " << list->size() << " newsgroups." << std::endl;
|
||||||
// e.name = "comp.os.os2.comm";
|
|
||||||
|
|
||||||
// 2541 headers
|
|
||||||
// e.name = "borland.public.cppbuilder.commandlinetools";
|
|
||||||
|
|
||||||
// 100026 headers (1859952 K) (1816.35 M)
|
|
||||||
// e.name = "dk.videnskab";
|
|
||||||
// 1000437 headers
|
|
||||||
e.name = "alt.bible";
|
|
||||||
|
|
||||||
// a million or so, but this one is very slow because all subjects look the
|
|
||||||
// same, so everything goes to the same token index, which means we're
|
|
||||||
// constantly waiting on a file lock.
|
|
||||||
// e.name = "usenetserver.test";
|
|
||||||
|
|
||||||
list = std::make_unique<std::vector<NntpListEntry>>();
|
|
||||||
list->emplace_back(e);
|
|
||||||
if ((list == nullptr) || (list->empty()))
|
|
||||||
{
|
|
||||||
std::cout << "Getting newsgroup list...";
|
|
||||||
std::cout.flush();
|
|
||||||
list = client.List();
|
|
||||||
app.GetDb().UpdateNewsgroupList(*list);
|
|
||||||
std::cout << "DONE." << std::endl;
|
|
||||||
std::cout.flush();
|
|
||||||
}
|
|
||||||
std::cout << "Number of newsgroups in newsgroup: "
|
|
||||||
<< list->size() << std::endl;
|
|
||||||
std::cout.flush();
|
std::cout.flush();
|
||||||
// END TEMPORARY TEST CODE
|
|
||||||
indexer.Index(*list);
|
indexer.Index(*list);
|
||||||
}
|
}
|
||||||
catch (const UsenetSearchException& e)
|
catch (const UsenetSearchException& e)
|
||||||
|
|
|
@ -38,8 +38,30 @@ max_tree_depth: 10
|
||||||
# The higher your tree max_tree_depth, the more likely you'll need to increase
|
# The higher your tree max_tree_depth, the more likely you'll need to increase
|
||||||
# this.
|
# this.
|
||||||
|
|
||||||
max_threads: 8
|
max_threads: 16
|
||||||
batch_size: 1000
|
batch_size: 10000
|
||||||
|
|
||||||
|
#############################
|
||||||
|
# Newsgroup filter settings #
|
||||||
|
#############################
|
||||||
|
|
||||||
|
# List one or more newsgroup regular expressions to include or exclude from
|
||||||
|
# being indexed. Blacklisted patterns take precedence over whitelisted patterns.
|
||||||
|
# These options may be repeated to include additional blacklist/whitelist
|
||||||
|
# regular expressions.
|
||||||
|
|
||||||
|
# If filter_newsgroup_whitelist is set, only newsgroups matching the configured
|
||||||
|
# regular expressions will be included in indexing.
|
||||||
|
# If not set, all of usenet will be indexed (with the exception of
|
||||||
|
# filter_newsgroup_blacklist groups)
|
||||||
|
|
||||||
|
filter_newsgroup_whitelist: ^alt\.bible$
|
||||||
|
filter_newsgroup_whitelist: ^borland\.public\.cppbuilder\.*
|
||||||
|
|
||||||
|
# filter_newsgroup_blacklist allows you to exclude newsgroups from being
|
||||||
|
# indexed, whether filter_newsgroup_whitelist is set or not.
|
||||||
|
|
||||||
|
filter_newsgroup_blacklist: .*binaries.*
|
||||||
|
|
||||||
########################
|
########################
|
||||||
# Word filter settings #
|
# Word filter settings #
|
||||||
|
@ -55,21 +77,21 @@ batch_size: 1000
|
||||||
# List of strings is comma-separated and case-insensitive. Each subsequent
|
# List of strings is comma-separated and case-insensitive. Each subsequent
|
||||||
# option appends to the previously defined list.
|
# option appends to the previously defined list.
|
||||||
|
|
||||||
filter_erase_subtoken: a,about,actually,almost,also,although,always,am,an,and
|
# filter_erase_subtoken: the,by
|
||||||
filter_erase_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
|
|
||||||
filter_erase_subtoken: does,each,either,else,for,from,had,has,have,hence,how
|
|
||||||
filter_erase_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
|
|
||||||
filter_erase_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where
|
|
||||||
filter_erase_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
|
|
||||||
filter_erase_subtoken: whoever,whose,why,will,with,within,without,would,yes
|
|
||||||
filter_erase_subtoken: yet,you,your
|
|
||||||
|
|
||||||
# This setting lets you list all tokens that will only be indexed on direct
|
# This setting lets you list all tokens that will only be indexed on direct
|
||||||
# (whole string) matches. Each token is comma-separated, and the configuration
|
# (whole string) matches. Each token is comma-separated, and the configuration
|
||||||
# option may be listed multiple times as well, each subsequent option appends to
|
# option may be listed multiple times as well, each subsequent option appends to
|
||||||
# the previously defined list. All tokens are case-insensitive.
|
# the previously defined list. All tokens are case-insensitive.
|
||||||
|
|
||||||
filter_no_subtoken: makes for,funny business
|
filter_no_subtoken: a,about,actually,almost,also,although,always,am,an,and
|
||||||
|
filter_no_subtoken: any,are,as,at,be,became,become,but,by,can,could,did,do
|
||||||
|
filter_no_subtoken: does,each,either,else,for,from,had,has,have,hence,how
|
||||||
|
filter_no_subtoken: i,if,in,is,it,its,just,may,maybe,me,might,mine,must,my
|
||||||
|
filter_no_subtoken: mine,must,my,neither,nor,not,of,oh,ok,the,to,when,where
|
||||||
|
filter_no_subtoken: whereas,wherever,whenever,whether,which,while,who,whom
|
||||||
|
filter_no_subtoken: whoever,whose,why,will,with,within,without,would,yes
|
||||||
|
filter_no_subtoken: yet,you,your
|
||||||
|
|
||||||
# Sets the minimum number of words in a sub-token. You may use this if you don't
|
# Sets the minimum number of words in a sub-token. You may use this if you don't
|
||||||
# want to index single-words unless they are a direct match to the subject (in
|
# want to index single-words unless they are a direct match to the subject (in
|
||||||
|
|
Loading…
Reference in New Issue