// UsenetSearch/src/Database.cpp
//
// 416 lines
// 12 KiB
// C++

/*
Copyright© 2021 John Sennesael
UsenetSearch is Free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include <iomanip>
#include <iostream>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <string>
#include <thread>
#include <vector>
#include "usenetsearch/Application.h"
#include "usenetsearch/StringUtils.h"
#include "usenetsearch/UsenetClient.h"
#include "usenetsearch/ScopeExit.h"
#include "usenetsearch/Serialize.h"
#include "usenetsearch/Database.h"
namespace usenetsearch {
// Database class --------------------------------------------------------------
// Constructs a Database, initialising m_app from the supplied Application.
// Nothing else happens here; the database location is set later through
// Open().
Database::Database(Application& app)
    : m_app(app)
{
}
// Closes the newsgroup list file handle when the Database is destroyed.
Database::~Database()
{
    m_newsGroupFileIO.Close();
}
std::unique_ptr<NntpListEntry> Database::FindNntpEntry(
const std::string& subject)
{
OpenNewsGroupFile();
ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64();
std::unique_ptr<NntpListEntry> result = nullptr;
for (std::uint64_t n = 0; n != numGroups; ++n)
{
NntpListEntry entry;
m_newsGroupFileIO >> entry;
if (entry.name == subject)
{
result = std::make_unique<NntpListEntry>(entry);
break;
}
}
return result;
}
std::uint32_t Database::GetLastIndexedArticle(std::uint64_t newsgroupID)
{
OpenNewsGroupFile();
ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64();
for (std::uint64_t n = 0; n != numGroups; ++n)
{
NntpListEntry entry;
m_newsGroupFileIO >> entry;
if (entry.id == newsgroupID)
{
return entry.lastIndexedArticle;
}
}
return 0;
}
// Maps a token to the path of the database file that holds it.
//
// The path is derived from StringHash(token):
//   <database path>/tokens/<hash[0..1]>/<hash[2]>/<hash[3..4]>.db
//
// token:  the token whose file path is wanted.
// mkdirs: when true, the directory part of the path is created if it does
//         not exist yet. The file itself is never created here.
std::filesystem::path Database::GetTokenFilePath(
    const std::string& token,
    bool mkdirs
)
{
    const std::string hash = StringHash(token);
    // All three pieces are cut from the hash before touching the filesystem.
    const std::filesystem::path directory =
        m_databasePath / "tokens" / hash.substr(0, 2) / hash.substr(2, 1);
    const std::string fileStem = hash.substr(3, 2);
    if (mkdirs && !std::filesystem::exists(directory))
    {
        std::filesystem::create_directories(directory);
    }
    return directory / (fileStem + ".db");
}
// Computes an id that is not used by any entry in `list`.
//
// The result is one more than the largest id found, or 0 for an empty list.
std::uint64_t Database::GetUniqueNntpEntryId(
    const std::vector<NntpListEntry>& list) const
{
    std::uint64_t candidate = 0;
    for (const auto& item: list)
    {
        // Push the candidate past every id that reaches or exceeds it.
        if (item.id >= candidate) candidate = item.id + 1;
    }
    return candidate;
}
void Database::MaxTreeDepth(std::uint8_t depth)
{
m_maxTreeDepth = depth;
}
std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
{
OpenNewsGroupFile();
ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
const size_t newsGroupCount = m_newsGroupFileIO.ReadInt64();
auto result = std::make_unique<std::vector<NntpListEntry>>();
for (size_t numLoaded = 0; numLoaded != newsGroupCount; ++numLoaded)
{
NntpListEntry entry;
m_newsGroupFileIO >> entry;
result->emplace_back(entry);
}
return result;
}
// Selects the directory that holds the database files, creating it when it
// does not exist yet.
//
// dbPath: root directory of the database. Missing parent directories are
//         created as well (create_directories), consistent with how
//         GetTokenFilePath() creates its token directories; the previous
//         create_directory call failed when the parent was absent.
//
// Filesystem errors are reported through std::filesystem::filesystem_error.
void Database::Open(std::filesystem::path dbPath)
{
    m_databasePath = dbPath;
    if (!std::filesystem::exists(dbPath))
    {
        std::filesystem::create_directories(dbPath);
    }
}
// Opens <database path>/newsgroups.db and positions the stream just past the
// version field, ready for the newsgroup count to be read or written.
//
// - Already open: only seeks back to the position right after the version.
// - Existing file: reads the stored version and compares it against
//   m_databaseVersion.
// - New file: writes the version and a newsgroup count of 0, then seeks back
//   to right after the version.
//
// Throws DatabaseException(EBADF, ...) when the stored version differs from
// m_databaseVersion. On that error, or any other exception raised while the
// header is read or written, the file is closed again before the exception
// propagates. Otherwise the handle would stay open and the next call would
// take the "already open" shortcut above, silently skipping the version
// check.
void Database::OpenNewsGroupFile()
{
    if (m_newsGroupFileIO.IsOpen())
    {
        m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg);
        return;
    }
    const std::filesystem::path newsGroupFilePath =
        m_databasePath / "newsgroups.db";
    // Must be checked before Open(), which may create the file.
    const bool exists = std::filesystem::exists(newsGroupFilePath);
    m_newsGroupFileIO.Open(newsGroupFilePath);
    try
    {
        if (exists)
        {
            const std::uint64_t ver = m_newsGroupFileIO.ReadInt64();
            if (ver != m_databaseVersion)
            {
                throw DatabaseException(EBADF,
                    std::string{"Mismatching newgroup file database version:"}
                    + " have: " + std::to_string(ver) + " - want: "
                    + std::to_string(m_databaseVersion)
                );
            }
        }
        else
        {
            m_newsGroupFileIO << m_databaseVersion;
            m_newsGroupFileIO << std::uint64_t{0}; // newsgroup count.
            m_newsGroupFileIO.Seek(
                sizeof(m_databaseVersion), std::ios_base::beg);
        }
    }
    catch (...)
    {
        // Never leave a half-validated handle behind.
        m_newsGroupFileIO.Close();
        throw;
    }
}
// Reads a token database file and hands each stored ArticleEntry to a
// callback.
//
// dbFile:  path of the token file to read.
// onParse: invoked once per entry, in file order.
//
// Throws DatabaseException(ENOTFOUND, ...) when dbFile does not exist.
void Database::ParseTokenFile(
    const std::filesystem::path& dbFile,
    std::function<void(const ArticleEntry& entry)> onParse)
{
    const bool present = std::filesystem::exists(dbFile);
    if (!present)
    {
        throw DatabaseException(
            ENOTFOUND,
            "File does not exist: " + dbFile.string()
        );
    }
    SerializableFile reader;
    reader.Open(dbFile);
    // The file starts with the number of entries that follow.
    std::uint64_t remaining = reader.ReadInt64();
    for (; remaining != 0; --remaining)
    {
        ArticleEntry parsed;
        reader >> parsed;
        onParse(parsed);
    }
}
void Database::SetLastIndexedArticle(
std::uint64_t newsgroupID,
std::int32_t articleID)
{
auto outItems = LoadNewsgroupList();
bool found{false};
if (outItems)
{
for (auto& entry: *outItems)
{
if (entry.id == newsgroupID)
{
entry.lastIndexedArticle = articleID;
found = true;
}
}
}
if (!found)
{
throw DatabaseException(EINVAL,
"Attempt to update newsgroup not found in database - id: "
+ std::to_string(newsgroupID));
}
UpdateNewsgroupList(*outItems);
}
// Breaks a search string into sub-tokens and stores each one for an article.
//
// newsgroupID:  id of the newsgroup the article belongs to.
// articleID:    id of the article being indexed.
// searchString: text to tokenize (handed to StringTreeOperation with " " as
//               the separator and m_maxTreeDepth as the depth limit).
//
// Each sub-token reported by StringTreeOperation is passed through the
// application filter's ProcessToken(); an empty result means the token is
// skipped, anything else is stored with SaveToken().
//
// searchString is passed straight through instead of being copied into a
// local first; Search() already passes its const reference the same way.
void Database::SaveSearchTokens(
    std::uint64_t newsgroupID,
    std::uint64_t articleID,
    const std::string& searchString)
{
    StringTreeOperation(
        searchString,
        " ",
        m_maxTreeDepth,
        [&](const std::string& subToken, const std::string& str){
            const std::string tok = m_app.GetFilter().ProcessToken(
                subToken,
                str
            );
            if (tok.empty()) return;
            SaveToken(tok, newsgroupID, articleID);
        }
    );
}
// Tells whether a token is already stored for a given article.
//
// subtoken:    the token to look for; an empty token is never present.
// newsgroupID: newsgroup id the stored entry must carry.
// articleID:   article id the stored entry must carry.
//
// Returns true when the token's database file contains an entry with both
// ids, false otherwise (including when the file does not exist).
//
// This is a pure lookup, so the token path is resolved with mkdirs=false:
// asking about an unknown token must not create directories on disk.
bool Database::HasToken(
    const std::string& subtoken,
    std::uint64_t newsgroupID,
    std::uint32_t articleID)
{
    if (subtoken.empty()) return false;
    const std::filesystem::path path = GetTokenFilePath(subtoken, false);
    if (!std::filesystem::exists(path)) return false;
    SerializableFile io;
    io.Open(path);
    // The file starts with the number of entries that follow.
    const std::uint64_t tokenCount = io.ReadInt64();
    for (std::uint64_t i = 0; i != tokenCount; ++i)
    {
        ArticleEntry token;
        io >> token;
        if (token.newsgroupID == newsgroupID && token.articleID == articleID)
        {
            return true;
        }
    }
    return false;
}
std::unique_ptr<std::vector<ArticleEntry>> Database::LoadTokens(
const std::filesystem::path dbFile,
const std::string& subtoken)
{
auto result = std::make_unique<std::vector<ArticleEntry>>();
if (!std::filesystem::exists(dbFile)) return result;
SerializableFile io;
io.Open(dbFile);
const std::uint64_t tokenCount = io.ReadInt64();
const auto tokenHash = StringHashBytes(subtoken);
for (std::uint64_t ntok = 0; ntok != tokenCount; ++ntok)
{
ArticleEntry entry{};
io >> entry;
if (entry.hash != tokenHash) continue;
result->emplace_back(entry);
}
return result;
}
void Database::SaveToken(
const std::string& subtoken,
std::uint64_t newsgroupID,
std::uint32_t articleID)
{
if (subtoken.empty()) return;
const std::filesystem::path path = GetTokenFilePath(subtoken, true);
const bool exists = std::filesystem::exists(path);
SerializableFile io;
io.Open(path);
ArticleEntry token{};
token.articleID = articleID;
token.newsgroupID = newsgroupID;
token.hash = StringHashBytes(subtoken);
if (exists)
{
// Read token count and increment it by one, and write it back.
const std::uint64_t tokenCount = io.ReadInt64() + 1;
io.Seek(0, std::ios_base::beg);
io << tokenCount;
// Now seek back to the end of the file to append our token entry.
io.Seek(0, std::ios_base::end);
}
else
{
// A new file just has a token count of 1 - should already be at pos=0.
io << std::uint64_t{1};
}
// write out token.
#if 0
std::cout << "Token: " << subtoken << std::endl;
std::cout << "Saving into file: " << path << std::endl;
std::cout << "Token hash: " << HashBytesToString(token.hash) << std::endl << std::endl;
#endif
io << token;
}
std::unique_ptr<std::vector<ArticleEntry>> Database::Search(
const std::string& searchString)
{
auto result = std::make_unique<std::vector<ArticleEntry>>();
// Tokenize the search string.
std::vector<std::string> searchTokens;
StringTreeOperation(
searchString,
" ",
m_maxTreeDepth,
[&searchTokens](const std::string& subToken, const std::string&){
searchTokens.emplace_back(subToken);
}
);
for (const auto& searchToken: searchTokens)
{
const auto path = GetTokenFilePath(searchToken, false);
const bool exists = std::filesystem::exists(path);
if (!exists) continue;
const auto foundTokens = LoadTokens(path, searchToken);
if (foundTokens->empty()) continue;
result->insert(result->end(), foundTokens->begin(), foundTokens->end());
std::cout << std::left << std::setw(searchString.length() + 7)
<< "token: " + searchToken
<< std::setw(3) << " | "
<< std::setw(10)
<< "db file: " << path.string()
<< std::setw(3) << " | "
<< std::setw(9)
<< "#results: " + std::to_string(foundTokens->size())
<< std::endl;
}
return result;
}
void Database::UpdateNewsgroupList(std::vector<NntpListEntry>& list)
{
if (list.size() == 0) return;
auto outList = LoadNewsgroupList();
for (auto& entry: list)
{
NntpListEntry newEntry(entry);
bool found{false};
if (outList)
{
std::for_each(
outList->begin(),
outList->end(),
[&entry, &found](NntpListEntry& oldEntry)
{
if (oldEntry.name == entry.name)
{
// update existing (copy everything but ID & name)
found = true;
oldEntry.count = entry.count;
oldEntry.high = entry.high;
oldEntry.lastIndexedArticle = entry.lastIndexedArticle;
oldEntry.low = entry.low;
oldEntry.status = entry.status;
}
}
);
}
if (found) continue;
// add new.
newEntry.id = GetUniqueNntpEntryId(*outList);
outList->emplace_back(newEntry);
entry.id = newEntry.id;
}
OpenNewsGroupFile();
ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); });
m_newsGroupFileIO << std::uint64_t{outList->size()};
std::for_each(
outList->begin(),
outList->end(),
[&](const NntpListEntry& e)
{
m_newsGroupFileIO << e;
}
);
}
} // namespace usenetsearch