UsenetSearch/src/Database.cpp

462 lines
12 KiB
C++

/*
Copyright© 2021 John Sennesael
UsenetSearch is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include <iostream>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <string>
#include <thread>
#include <vector>
#include "usenetsearch/StringUtils.h"
#include "usenetsearch/UsenetClient.h"
#include "usenetsearch/ScopeExit.h"
#include "usenetsearch/Serialize.h"
#include "usenetsearch/Database.h"
namespace usenetsearch {
/// Computes a byte-size figure for this entry: the size of the articleIDs
/// container object itself, 64 bits per stored article ID, one additional
/// 64-bit slot (presumably the string length prefix - TODO confirm against
/// the serializer), and one char per character of the search string.
size_t ArticleEntry::Size() const
{
    const size_t idBytes = articleIDs.size() * sizeof(std::uint64_t);
    const size_t textBytes = searchString.size() * sizeof(char);
    return sizeof(articleIDs) + idBytes + sizeof(std::uint64_t) + textBytes;
}
// Database class --------------------------------------------------------------
/// Closes the newsgroup list stream if it is still open.
Database::~Database()
{
    const bool stillOpen = m_newsGroupFileIO.is_open();
    if (!stillOpen)
    {
        return;
    }
    m_newsGroupFileIO.close();
}
/// Builds the path of the article list file for a newsgroup.
///
/// The newsgroup name is converted to bytes and hashed with StringHash(); the
/// first three character pairs of the hash become three nested directories
/// below "<database path>/articles", and the file is named "<hash>.db".
///
/// @param newsgroup Name of the newsgroup.
/// @param mkdirs    When true, the directory chain is created if missing.
/// @return Full path to the article list file (the file itself is not created).
std::filesystem::path Database::GetArticleFilePath(
    const std::wstring& newsgroup, bool mkdirs)
{
    const std::string hash = StringHash(m_conv.to_bytes(newsgroup));
    std::filesystem::path directory = m_databasePath / "articles";
    // Fan the files out over three directory levels, two hash chars each.
    for (size_t offset = 0; offset != 6; offset += 2)
    {
        directory /= hash.substr(offset, 2);
    }
    if (mkdirs && !std::filesystem::exists(directory))
    {
        std::filesystem::create_directories(directory);
    }
    return directory / (hash + ".db");
}
bool Database::GetArticleEntry(
const std::string& subToken,
const std::string& searchString,
ArticleEntry& entry,
size_t& startPosition,
size_t& endPosition,
size_t& count)
{
const auto path = GetTokenFilePath(subToken);
if (!std::filesystem::exists(path)) return false;
SerializableFile io;
LockFile(path);
ScopeExit closeAndUnlockFile([&](){
if (io.is_open()) io.close();
UnlockFile(path);
});
io.open(path, std::ios::binary | std::ios::in);
if (!io.is_open()) return false;
std::uint64_t articleCount{0};
io >> articleCount;
size_t startPos{0};
size_t endPos{0};
for (std::uint64_t i = 0; i != articleCount; ++i)
{
ArticleEntry curEntry{};
startPos = io.tellg();
io >> curEntry;
endPos = io.tellg();
if (curEntry.searchString == searchString)
{
entry = curEntry;
startPosition = startPos;
endPosition = endPos;
count = i + 1;
return true;
}
}
return false;
}
/// Builds the path of the token file for a search token.
///
/// The token is hashed with StringHash(); the first three character pairs of
/// the hash become three nested directories below "<database path>/tokens",
/// and the file is named "<hash>.db".
///
/// @param token  The search token.
/// @param mkdirs When true, the directory chain is created if missing.
/// @return Full path to the token file (the file itself is not created).
std::filesystem::path Database::GetTokenFilePath(
    const std::string& token,
    bool mkdirs
)
{
    const std::string hash = StringHash(token);
    const std::string level1 = hash.substr(0, 2);
    const std::string level2 = hash.substr(2, 2);
    const std::string level3 = hash.substr(4, 2);
    std::filesystem::path directory = m_databasePath;
    directory /= "tokens";
    directory /= level1;
    directory /= level2;
    directory /= level3;
    const bool mustCreate = mkdirs && !std::filesystem::exists(directory);
    if (mustCreate)
    {
        std::filesystem::create_directories(directory);
    }
    return directory / (hash + ".db");
}
/// Checks whether `articleID` is already recorded for the search string
/// `subject` inside the token file that belongs to `subToken`.
///
/// The token file is locked while it is being scanned.
///
/// @param subToken  Token whose file should be scanned.
/// @param subject   Search string to match exactly.
/// @param articleID Article ID to look for.
/// @return true when an entry with a matching search string lists the
///         article ID; false otherwise (including when the token file does
///         not exist or cannot be opened).
bool Database::HasToken(
    const std::string& subToken,
    const std::string& subject,
    std::uint64_t articleID)
{
    const auto path = GetTokenFilePath(subToken);
    if (!std::filesystem::exists(path)) return false;
    SerializableFile io;
    LockFile(path);
    // Runs on every exit path: close the stream first, then release the lock.
    ScopeExit closeAndUnlockFile([&](){
        if (io.is_open()) io.close();
        UnlockFile(path);
    });
    io.open(path, std::ios::binary | std::ios::in);
    if (!io.is_open()) return false;
    // Read the entry count the same way GetArticleEntry() does, so both
    // readers agree with the `io << entryCount` written by SaveToken().
    std::uint64_t articleCount{0};
    io >> articleCount;
    // The loop must run over the stored entry count; the previous bound was a
    // local that was always zero, so no entry was ever inspected.
    for (std::uint64_t i = 0; i != articleCount; ++i)
    {
        ArticleEntry entry;
        io >> entry;
        if (entry.searchString != subject)
        {
            continue;
        }
        const auto& ids = entry.articleIDs;
        if (std::find(ids.begin(), ids.end(), articleID) != ids.end())
        {
            return true;
        }
    }
    return false;
}
/// Blocks the calling thread until `file` is not registered as locked, then
/// registers it as locked.
///
/// The check and the registration happen under a single acquisition of
/// m_lockedFilesMutex. Doing them under two separate acquisitions would allow
/// two threads to both observe "not locked" and both register the same file.
///
/// The lock is not reentrant: calling this for a file that is already
/// registered waits (polling every 20 ms) until UnlockFile() removes it.
///
/// @param file Path of the file to lock.
void Database::LockFile(std::filesystem::path file)
{
#if 1
    while (true)
    {
        {
            std::lock_guard<std::mutex> lock(m_lockedFilesMutex);
            const auto it = std::find(
                m_lockedFiles.begin(), m_lockedFiles.end(), file);
            if (it == m_lockedFiles.end())
            {
                // Claim the file while still holding the mutex.
                m_lockedFiles.emplace_back(file);
                return;
            }
        }
        // Somebody else holds the file; back off with the mutex released.
        std::this_thread::sleep_for(std::chrono::milliseconds{20});
    }
#else
    // Disabled alternative: lock marker files on disk.
    const std::filesystem::path lockFilePath = file.string() + ".lock";
    while (std::filesystem::exists(lockFilePath))
    {
        std::this_thread::sleep_for(std::chrono::milliseconds{20});
    }
    std::ofstream touch(lockFilePath);
#endif
}
/// Removes `file` from the set of locked files. Does nothing when the file
/// is not currently registered as locked.
///
/// @param file Path of the file to unlock.
void Database::UnlockFile(std::filesystem::path file)
{
#if 1
    std::lock_guard<std::mutex> guard(m_lockedFilesMutex);
    auto position = std::find(
        m_lockedFiles.begin(), m_lockedFiles.end(), file);
    if (position == m_lockedFiles.end())
    {
        return;
    }
    m_lockedFiles.erase(position);
#else
    // Disabled alternative: lock marker files on disk.
    const std::filesystem::path marker = file.string() + ".lock";
    if (!std::filesystem::exists(marker))
    {
        return;
    }
    std::filesystem::remove(marker);
#endif
}
/// Loads all article headers stored for a newsgroup.
///
/// The article file is locked while it is being read.
///
/// @param newsgroup Name of the newsgroup.
/// @return The headers read from the newsgroup's article file.
/// @throws DatabaseException when no article file exists for the newsgroup or
///         the file cannot be opened.
std::unique_ptr<std::vector<NntpHeader>> Database::LoadArticleList(
    const std::wstring& newsgroup)
{
    const auto articleFile = GetArticleFilePath(newsgroup);
    if (!std::filesystem::exists(articleFile))
    {
        throw DatabaseException(ENOTFOUND,
            "No article list found for newsgroup " + m_conv.to_bytes(newsgroup)
        );
    }
    LockFile(articleFile);
    ScopeExit unlockFile([&](){ UnlockFile(articleFile); });
    SerializableFile io;
    io.open(articleFile, std::ios::binary | std::ios::in);
    if (!io.is_open())
    {
        // Without this check the count below would be read from a closed
        // stream and the loop would run over a meaningless value.
        throw DatabaseException(ENOTFOUND,
            "Could not open article list for newsgroup "
            + m_conv.to_bytes(newsgroup)
        );
    }
    // Zero-initialised so a short or failed read cannot leave an
    // indeterminate loop bound.
    // NOTE(review): UpdateArticleList() writes this count with operator<<
    // while it is read back here as raw bytes - confirm both agree.
    std::uint64_t articleCount{0};
    io.read(
        reinterpret_cast<char*>(&articleCount),
        sizeof(articleCount)
    );
    auto result = std::make_unique<std::vector<NntpHeader>>();
    for (std::uint64_t i = 0; i != articleCount; ++i)
    {
        NntpHeader header;
        io >> header;
        result->emplace_back(header);
    }
    io.close();
    return result;
}
/// Loads the list of newsgroups from the newsgroup file.
///
/// Reads the database version, verifies it against m_databaseVersion, then
/// reads the entry count followed by that many entries.
///
/// NOTE(review): reading starts at the stream's current position; nothing
/// here rewinds m_newsGroupFileIO - confirm callers only load right after
/// opening.
///
/// @return The newsgroup entries read from the file.
/// @throws DatabaseException when the stored version differs from
///         m_databaseVersion.
std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
{
    OpenNewsGroupFile();
    std::uint64_t dbVersion{0};
    m_newsGroupFileIO.read(
        reinterpret_cast<char*>(&dbVersion),
        sizeof(dbVersion)
    );
    if (dbVersion != m_databaseVersion)
    {
        throw DatabaseException(EINVAL,
            "The loaded database version (" + std::to_string(dbVersion)
            + ") does not match the current database version ("
            + std::to_string(m_databaseVersion) + ")");
    }
    // UpdateNewsgroupList() stores the count as std::uint64_t, so it must be
    // read back with that exact width; size_t is not 64 bits everywhere.
    std::uint64_t newsGroupCount{0};
    m_newsGroupFileIO.read(
        reinterpret_cast<char*>(&newsGroupCount),
        sizeof(newsGroupCount)
    );
    auto result = std::make_unique<std::vector<NntpListEntry>>();
    for (std::uint64_t numLoaded = 0; numLoaded != newsGroupCount; ++numLoaded)
    {
        NntpListEntry entry;
        m_newsGroupFileIO >> entry;
        result->emplace_back(entry);
    }
    return result;
}
/// Selects `dbPath` as the database directory, creating that directory when
/// it does not exist yet, and opens the newsgroup list file inside it.
///
/// @param dbPath Directory that holds the database files.
void Database::Open(std::filesystem::path dbPath)
{
    m_databasePath = dbPath;
    const bool missing = !std::filesystem::exists(dbPath);
    if (missing)
    {
        std::filesystem::create_directory(dbPath);
    }
    OpenNewsGroupFile();
}
/// Opens "<database path>/newsgroups.db" for binary reading and writing,
/// unless the stream is already open.
///
/// Opening a file stream with `in | out` does not create a missing file, so
/// an empty file is created first when none exists; otherwise the very first
/// newsgroup list written to a fresh database would be silently dropped.
void Database::OpenNewsGroupFile()
{
    if (m_newsGroupFileIO.is_open())
    {
        return;
    }
    const std::filesystem::path newsGroupFilePath =
        m_databasePath / "newsgroups.db";
    if (!std::filesystem::exists(newsGroupFilePath))
    {
        // Append mode creates the file without truncating it should another
        // writer have created it in the meantime.
        std::ofstream create(
            newsGroupFilePath, std::ios::binary | std::ios::app);
    }
    m_newsGroupFileIO.open(newsGroupFilePath,
        std::ios::binary | std::ios::in | std::ios::out);
}
void Database::UpdateArticleList(
const std::wstring& newsgroup,
const std::vector<NntpHeader>& headers)
{
const auto articleFile = GetArticleFilePath(newsgroup, true);
LockFile(articleFile);
ScopeExit unlockFile([&](){
UnlockFile(articleFile);
});
SerializableFile io;
io.open(articleFile, std::ios::binary | std::ios::out);
const std::uint64_t articleCount = headers.size();
io << articleCount;
for (const auto& header: headers)
{
io << header;
}
io.close();
}
void Database::UpdateNewsgroupList(const std::vector<NntpListEntry>& list)
{
OpenNewsGroupFile();
m_newsGroupFileIO.write(
reinterpret_cast<const char*>(&m_databaseVersion),
sizeof(m_databaseVersion)
);
const std::uint64_t newsGroupCount = list.size();
m_newsGroupFileIO << newsGroupCount;
for (const auto& entry: list)
{
m_newsGroupFileIO << entry;
}
m_newsGroupFileIO.flush();
}
void Database::SaveSearchTokens(
std::uint64_t articleID,
const std::string& searchString)
{
const std::string sstr(searchString);
StringTreeOperation(
sstr,
" ",
m_maxTreeDepth,
[&](const std::string& subToken, const std::string& str){
try
{
SaveToken(subToken, str, articleID);
}
catch (const SerializeException& e)
{
/// @todo do graceful magic here.
std::cout << "Broken file for \"" << subToken << "\"" << std::endl;
}
}
);
}
void Database::SaveToken(
const std::string& subtoken,
const std::string& subject,
std::uint64_t articleID)
{
if (subtoken == "") return;
if (subtoken == " ") return;
if (subject == "") return;
if (subject == " ") return;
if (HasToken(subtoken, subject, articleID)) return;
const auto path = GetTokenFilePath(subtoken, true);
size_t startPos{0};
size_t endPos{0};
size_t count{0};
ArticleEntry entry{};
bool found = GetArticleEntry(
subtoken, subject, entry, startPos, endPos, count);
std::uint64_t entryCount{0};
// Update existing?
if (!std::filesystem::exists(path))
{
// Creating a new token file is pretty simple.
entryCount = 1;
SerializableFile io;
LockFile(path);
ScopeExit unlockFile([&](){
if (io.is_open()) io.close();
UnlockFile(path);
});
io.open(path, std::ios::binary | std::ios::out);
io << entryCount;
entry.searchString = subject;
entry.articleIDs.emplace_back(articleID);
io << entry;
}
else
{
// Updating an existing token file is a bit more complicated.
// See if the search string already exists in the token file.
// Open file.
SerializableFile io;
io.open(path, std::ios::binary | std::ios::in | std::ios::out);
LockFile(path);
ScopeExit unlockFile([&](){
if (io.is_open()) io.close();
UnlockFile(path);
});
// Read entry count.
io >> entryCount;
if (found)
{
/* Seek to the end of the entry and read all following entries. */
io.seekg(endPos);
std::vector<ArticleEntry> entries;
for (auto i = count; i != entryCount; ++i)
{
ArticleEntry e{};
io >> e;
entries.emplace_back(e);
}
// Seek back to the start of the existing entry and rewrite it.
io.seekg(startPos);
io << entry;
// Now re-write all following entries.
for (auto& e: entries)
{
io << e;
}
}
else
{
// This is a new search string in the token file, append to the end.
io.seekg(0, std::ios_base::end);
entry.searchString = subject;
entry.articleIDs = {articleID};
io << entry;
// Increment entry count.
entryCount++;
io.seekg(0);
io << entryCount;
}
}
}
} // namespace usenetsearch