// File: UsenetSearch/src/Database.cpp (463 lines, 13 KiB, C++)

/*
Copyright© 2021 John Sennesael
UsenetSearch is Free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include "usenetsearch/Database.h"
#include "usenetsearch/Application.h"
#include "usenetsearch/Logger.h"
#include "usenetsearch/StringUtils.h"
#include "usenetsearch/ScopeExit.h"
#include "usenetsearch/Serialize.h"
#include "usenetsearch/UsenetClient.h"
#include <iomanip>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <string>
#include <thread>
#include <vector>
namespace usenetsearch {
// Database class --------------------------------------------------------------
// Confirms that the version stamp stored at offset 0 of `f` equals
// m_databaseVersion. The file is rewound before reading, so the read position
// ends up immediately after the version field. A mismatch is reported as a
// fatal DatabaseException through the Logger.
void Database::CheckDbVersion(const SerializableFile& f) const
{
    f.Seek(0);
    const std::uint64_t storedVersion = f.ReadInt64();
    if (storedVersion == m_databaseVersion)
    {
        return;
    }
    Logger::Get().Fatal<DatabaseException>(
        LOGID("Database"),
        "Wrong database version - Got: " + std::to_string(storedVersion)
            + " want: " + std::to_string(m_databaseVersion)
    );
}
std::unique_ptr<NntpListEntry> Database::FindNntpEntry(std::uint64_t id)
{
const auto path = GetNewsGroupFilePath();
if (!std::filesystem::exists(path)) return nullptr;
SerializableFile io;
io.Open(path);
CheckDbVersion(io);
const std::uint64_t numGroups = io.ReadInt64();
std::unique_ptr<NntpListEntry> result = nullptr;
for (std::uint64_t n = 0; n != numGroups; ++n)
{
NntpListEntry entry;
io >> entry;
if (entry.id == id)
{
result = std::make_unique<NntpListEntry>(entry);
break;
}
}
return result;
}
std::unique_ptr<NntpListEntry> Database::FindNntpEntry(
const std::string& subject)
{
const auto path = GetNewsGroupFilePath();
if (!std::filesystem::exists(path)) return nullptr;
SerializableFile io;
io.Open(path);
CheckDbVersion(io);
const std::uint64_t numGroups = io.ReadInt64();
std::unique_ptr<NntpListEntry> result = nullptr;
for (std::uint64_t n = 0; n != numGroups; ++n)
{
NntpListEntry entry;
io >> entry;
if (entry.name == subject)
{
result = std::make_unique<NntpListEntry>(entry);
break;
}
}
return result;
}
std::uint32_t Database::GetLastIndexedArticle(std::uint64_t newsgroupID)
{
const auto path = GetNewsGroupFilePath();
if (!std::filesystem::exists(path))
{
Logger::Get().Fatal<DatabaseException>(
LOGID("Database"),
"No indexed articles for newsgroup: "
+ std::to_string(newsgroupID)
);
}
SerializableFile io;
io.Open(path);
CheckDbVersion(io);
const std::uint64_t numGroups = io.ReadInt64();
for (std::uint64_t n = 0; n != numGroups; ++n)
{
NntpListEntry entry;
io >> entry;
if (entry.id == newsgroupID)
{
return entry.lastIndexedArticle;
}
}
Logger::Get().Fatal<DatabaseException>(
LOGID("Database"),
"No indexed articles for newsgroup: " + std::to_string(newsgroupID)
);
return NntpListEntry::NOT_INDEXED;
}
// Maps a token to the bucket file that stores its entries:
//   <database path>/tokens/<h[0..1]>/<h[2]>/<h[3..4]>.db
// where h is the string returned by StringHash(token). When `mkdirs` is true,
// the bucket directory is created if it does not exist yet.
std::filesystem::path Database::GetTokenFilePath(
    const std::string& token,
    bool mkdirs
)
{
    const std::string digest = StringHash(token);
    const std::string topLevel = digest.substr(0, 2);
    const std::string midLevel = digest.substr(2, 1);
    const std::string fileName = digest.substr(3, 2) + ".db";

    const std::filesystem::path bucketDir =
        m_databasePath / "tokens" / topLevel / midLevel;
    if (mkdirs && !std::filesystem::exists(bucketDir))
    {
        std::filesystem::create_directories(bucketDir);
    }
    return bucketDir / fileName;
}
// Returns an id that is not used by any entry in `list`: one more than the
// largest id present, or 0 for an empty list.
std::uint64_t Database::GetUniqueNntpEntryId(
    const std::vector<NntpListEntry>& list) const
{
    std::uint64_t nextId = 0;
    for (const auto& item : list)
    {
        if (item.id >= nextId)
        {
            nextId = item.id + 1;
        }
    }
    return nextId;
}
// Sets m_maxTreeDepth, the depth value that SaveSearchTokens() and Search()
// pass to StringTreeOperation() when breaking a string into tokens.
void Database::MaxTreeDepth(std::uint8_t depth)
{
m_maxTreeDepth = depth;
}
std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
{
auto result = std::make_unique<std::vector<NntpListEntry>>();
const auto path = GetNewsGroupFilePath();
if (!std::filesystem::exists(path)) return result;
SerializableFile io;
io.Open(path);
CheckDbVersion(io);
const size_t newsGroupCount = io.ReadInt64();
for (size_t numLoaded = 0; numLoaded != newsGroupCount; ++numLoaded)
{
NntpListEntry entry;
io >> entry;
result->emplace_back(entry);
}
return result;
}
// Selects `dbPath` as the database root directory and makes sure it exists.
// Missing parent directories are created as well (create_directories), which
// matches how GetTokenFilePath() creates its bucket directories; the previous
// create_directory call failed whenever the parent of `dbPath` was absent.
void Database::Open(std::filesystem::path dbPath)
{
    m_databasePath = dbPath;
    if (!std::filesystem::exists(dbPath))
    {
        std::filesystem::create_directories(dbPath);
    }
}
// Returns the location of the newsgroup list file, "newsgroups.db", directly
// under the database root directory.
std::filesystem::path Database::GetNewsGroupFilePath() const
{
    std::filesystem::path groupFile = m_databasePath;
    groupFile /= "newsgroups.db";
    return groupFile;
}
void Database::ParseTokenFile(
const std::filesystem::path& dbFile,
std::function<void(const ArticleEntry& entry)> onParse)
{
if (!std::filesystem::exists(dbFile))
{
Logger::Get().Fatal<DatabaseException>(
LOGID("Database"),
"File does not exist: " + dbFile.string()
);
}
SerializableFile io;
io.Open(dbFile);
const std::uint64_t tokenCount = io.ReadInt64();
for (std::uint64_t i = 0; i != tokenCount; ++i)
{
if (Application::Get().ShouldStop()) return;
ArticleEntry token;
io >> token;
onParse(token);
}
}
void Database::SetLastIndexedArticle(
std::uint64_t newsgroupID,
std::int32_t articleID)
{
auto outItems = LoadNewsgroupList();
bool found{false};
if (outItems)
{
for (auto& entry: *outItems)
{
if (entry.id == newsgroupID)
{
entry.lastIndexedArticle = articleID;
found = true;
break;
}
}
}
if (!found)
{
Logger::Get().Fatal<DatabaseException>(
LOGID("Database"),
"Attempt to update newsgroup not found in database - id: "
+ std::to_string(newsgroupID)
);
}
UpdateNewsgroupList(*outItems);
}
void Database::SaveSearchTokens(
std::uint64_t newsgroupID,
std::uint64_t articleID,
const std::string& searchString)
{
const std::string sstr(searchString);
StringTreeOperation(
sstr,
" ",
m_maxTreeDepth,
[&](const std::string& subToken, const std::string& str){
const std::string tok = Application::Get().GetFilter().ProcessToken(
subToken,
str
);
if (tok.empty()) return;
SaveToken(tok, newsgroupID, articleID);
}
);
}
bool Database::HasToken(
const std::string& subtoken,
std::uint64_t newsgroupID,
std::uint32_t articleID)
{
if (subtoken.empty()) return false;
const std::filesystem::path path = GetTokenFilePath(subtoken, true);
if (!std::filesystem::exists(path)) return false;
SerializableFile io;
io.Open(path);
const std::uint64_t tokenCount = io.ReadInt64();
for (std::uint64_t i = 0; i != tokenCount; ++i)
{
ArticleEntry token;
io >> token;
if (token.newsgroupID == newsgroupID)
{
if (token.articleID == articleID) return true;
}
}
return false;
}
std::unique_ptr<std::vector<ArticleEntry>> Database::LoadTokens(
const std::filesystem::path dbFile,
const std::string& subtoken)
{
auto result = std::make_unique<std::vector<ArticleEntry>>();
if (!std::filesystem::exists(dbFile)) return result;
SerializableFile io;
io.Open(dbFile);
const std::uint64_t tokenCount = io.ReadInt64();
const auto tokenHash = StringHashBytes(subtoken);
for (std::uint64_t ntok = 0; ntok != tokenCount; ++ntok)
{
ArticleEntry entry{};
io >> entry;
if (entry.hash != tokenHash) continue;
result->emplace_back(entry);
}
return result;
}
void Database::SaveToken(
const std::string& subtoken,
std::uint64_t newsgroupID,
std::uint32_t articleID)
{
if (subtoken.empty()) return;
const std::filesystem::path path = GetTokenFilePath(subtoken, true);
const bool exists = std::filesystem::exists(path);
SerializableFile io;
io.Open(path);
ArticleEntry token{};
token.articleID = articleID;
token.newsgroupID = newsgroupID;
token.hash = StringHashBytes(subtoken);
if (exists)
{
// Read token count and increment it by one, and write it back.
const std::uint64_t tokenCount = io.ReadInt64() + 1;
io.Seek(0, std::ios_base::beg);
io << tokenCount;
// Now seek back to the end of the file to append our token entry.
io.Seek(0, std::ios_base::end);
}
else
{
// A new file just has a token count of 1 - should already be at pos=0.
io << std::uint64_t{1};
}
// write out token.
io << token;
}
std::unique_ptr<std::vector<ArticleEntry>> Database::Search(
const std::string& searchString)
{
auto result = std::make_unique<std::vector<ArticleEntry>>();
// Tokenize the search string.
std::vector<std::string> searchTokens;
StringTreeOperation(
searchString,
" ",
m_maxTreeDepth,
[&searchTokens](const std::string& subToken, const std::string&){
searchTokens.emplace_back(subToken);
}
);
for (const auto& searchToken: searchTokens)
{
const auto path = GetTokenFilePath(searchToken, false);
const bool exists = std::filesystem::exists(path);
if (!exists) continue;
const auto foundTokens = LoadTokens(path, searchToken);
if (foundTokens->empty()) continue;
result->insert(result->end(), foundTokens->begin(), foundTokens->end());
}
return result;
}
// Brings the `lastIndexedArticle` fields of `a` and `b` into agreement.
// If exactly one side is NntpListEntry::NOT_INDEXED it takes the other side's
// value; otherwise the larger of the two values is written to both.
void Database::SyncLastUpdated(
    NntpListEntry& a,
    NntpListEntry& b
)
{
    if (a.lastIndexedArticle == b.lastIndexedArticle)
    {
        // Already in agreement.
        return;
    }

    if (a.lastIndexedArticle == NntpListEntry::NOT_INDEXED)
    {
        // Only `b` carries a real value.
        a.lastIndexedArticle = b.lastIndexedArticle;
    }
    else if (b.lastIndexedArticle == NntpListEntry::NOT_INDEXED)
    {
        // Only `a` carries a real value.
        b.lastIndexedArticle = a.lastIndexedArticle;
    }
    else if (a.lastIndexedArticle > b.lastIndexedArticle)
    {
        // Both are indexed: the higher article number wins.
        b.lastIndexedArticle = a.lastIndexedArticle;
    }
    else
    {
        a.lastIndexedArticle = b.lastIndexedArticle;
    }
}
void Database::UpdateNewsgroupList(std::vector<NntpListEntry>& list)
{
if (list.size() == 0) return;
auto outList = LoadNewsgroupList();
for (auto& entry: list)
{
bool found{false};
if (outList)
{
std::for_each(
outList->begin(),
outList->end(),
[this, &entry, &found](NntpListEntry& oldEntry)
{
if (oldEntry.name == entry.name)
{
// update existing- the passed entry has updated counts
// and status.
found = true;
oldEntry.count = entry.count;
oldEntry.high = entry.high;
oldEntry.low = entry.low;
oldEntry.status = entry.status;
// As for lastIndexed: whoemever's higher wins.
SyncLastUpdated(entry, oldEntry);
// The ID should be sticky - whatever's already in the
// db is what we care about, so update it in the passed
// entries.
entry.id = oldEntry.id;
}
}
);
}
if (found) continue;
// add new.
NntpListEntry newEntry(entry);
newEntry.id = GetUniqueNntpEntryId(*outList);
outList->emplace_back(newEntry);
entry.id = newEntry.id;
}
SerializableFile io;
io.Open(GetNewsGroupFilePath());
io << m_databaseVersion;
io << std::uint64_t{outList->size()};
std::for_each(
outList->begin(),
outList->end(),
[&](const NntpListEntry& e)
{
io << e;
}
);
}
} // namespace usenetsearch