462 lines
12 KiB
C++
462 lines
12 KiB
C++
/*
    Copyright© 2021 John Sennesael

    UsenetSearch is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
|
|
|
|
#include <algorithm>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

#include "usenetsearch/StringUtils.h"
#include "usenetsearch/UsenetClient.h"
#include "usenetsearch/ScopeExit.h"
#include "usenetsearch/Serialize.h"

#include "usenetsearch/Database.h"
|
|
|
|
namespace usenetsearch {
|
|
|
|
size_t ArticleEntry::Size() const
|
|
{
|
|
return sizeof(articleIDs) + (articleIDs.size() * sizeof(std::uint64_t))
|
|
+ sizeof(std::uint64_t) + (searchString.size() * sizeof(char));
|
|
}
|
|
|
|
// Database class --------------------------------------------------------------
|
|
|
|
/// Closes the newsgroup list file if it is still open.
Database::~Database()
{
    if (!m_newsGroupFileIO.is_open())
    {
        return;
    }
    m_newsGroupFileIO.close();
}
|
|
|
|
std::filesystem::path Database::GetArticleFilePath(
|
|
const std::wstring& newsgroup, bool mkdirs)
|
|
{
|
|
const std::string md5 = StringHash(m_conv.to_bytes(newsgroup));
|
|
const std::string tok1 = md5.substr(0, 2);
|
|
const std::string tok2 = md5.substr(2, 2);
|
|
const std::string tok3 = md5.substr(4, 2);
|
|
const std::filesystem::path groupPath = m_databasePath / "articles"
|
|
/ tok1 / tok2 / tok3;
|
|
if (mkdirs)
|
|
{
|
|
if (!std::filesystem::exists(groupPath))
|
|
{
|
|
std::filesystem::create_directories(groupPath);
|
|
}
|
|
}
|
|
const auto groupFile = md5 + ".db";
|
|
return groupPath / groupFile;
|
|
}
|
|
|
|
bool Database::GetArticleEntry(
|
|
const std::string& subToken,
|
|
const std::string& searchString,
|
|
ArticleEntry& entry,
|
|
size_t& startPosition,
|
|
size_t& endPosition,
|
|
size_t& count)
|
|
{
|
|
const auto path = GetTokenFilePath(subToken);
|
|
if (!std::filesystem::exists(path)) return false;
|
|
|
|
SerializableFile io;
|
|
|
|
LockFile(path);
|
|
ScopeExit closeAndUnlockFile([&](){
|
|
if (io.is_open()) io.close();
|
|
UnlockFile(path);
|
|
});
|
|
|
|
io.open(path, std::ios::binary | std::ios::in);
|
|
if (!io.is_open()) return false;
|
|
|
|
std::uint64_t articleCount{0};
|
|
io >> articleCount;
|
|
size_t startPos{0};
|
|
size_t endPos{0};
|
|
for (std::uint64_t i = 0; i != articleCount; ++i)
|
|
{
|
|
ArticleEntry curEntry{};
|
|
startPos = io.tellg();
|
|
io >> curEntry;
|
|
endPos = io.tellg();
|
|
if (curEntry.searchString == searchString)
|
|
{
|
|
entry = curEntry;
|
|
startPosition = startPos;
|
|
endPosition = endPos;
|
|
count = i + 1;
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
std::filesystem::path Database::GetTokenFilePath(
|
|
const std::string& token,
|
|
bool mkdirs
|
|
)
|
|
{
|
|
const std::string md5 = StringHash(token);
|
|
const std::string tok1 = md5.substr(0, 2);
|
|
const std::string tok2 = md5.substr(2, 2);
|
|
const std::string tok3 = md5.substr(4, 2);
|
|
const std::filesystem::path groupPath = m_databasePath / "tokens"
|
|
/ tok1 / tok2 / tok3;
|
|
if (mkdirs)
|
|
{
|
|
if (!std::filesystem::exists(groupPath))
|
|
{
|
|
std::filesystem::create_directories(groupPath);
|
|
}
|
|
}
|
|
const auto groupFile = md5 + ".db";
|
|
return groupPath / groupFile;
|
|
}
|
|
|
|
|
|
|
|
bool Database::HasToken(
|
|
const std::string& subToken,
|
|
const std::string& subject,
|
|
std::uint64_t articleID)
|
|
{
|
|
const auto path = GetTokenFilePath(subToken);
|
|
if (!std::filesystem::exists(path)) return false;
|
|
LockFile(path);
|
|
ScopeExit unlockFile([&](){ UnlockFile(path); });
|
|
SerializableFile io;
|
|
io.open(path, std::ios::binary | std::ios::in);
|
|
std::uint64_t articleCount{0};
|
|
io.read(reinterpret_cast<char*>(&articleCount), sizeof(articleCount));
|
|
std::uint64_t c = 0;
|
|
for (std::uint64_t i = 0; i != c; ++i)
|
|
{
|
|
ArticleEntry entry;
|
|
io >> entry;
|
|
if (entry.searchString == subject)
|
|
{
|
|
if (std::find(
|
|
entry.articleIDs.begin(),
|
|
entry.articleIDs.end(), articleID) != entry.articleIDs.end())
|
|
{
|
|
return true;
|
|
}
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
void Database::LockFile(std::filesystem::path file)
|
|
{
|
|
#if 1
|
|
while (true)
|
|
{
|
|
{
|
|
std::lock_guard<std::mutex> lock(m_lockedFilesMutex);
|
|
auto it = std::find(m_lockedFiles.begin(), m_lockedFiles.end(), file);
|
|
if (it == m_lockedFiles.end()) break;
|
|
}
|
|
//std::cout << "Waiting on lock: " << file.string() << std::endl;
|
|
//std::this_thread::sleep_for(std::chrono::milliseconds{1000});
|
|
std::this_thread::sleep_for(std::chrono::milliseconds{20});
|
|
}
|
|
{
|
|
std::lock_guard<std::mutex> lock(m_lockedFilesMutex);
|
|
m_lockedFiles.emplace_back(file);
|
|
}
|
|
#else
|
|
const std::filesystem::path lockFilePath = file.string() + ".lock";
|
|
while (true)
|
|
{
|
|
if (!std::filesystem::exists(lockFilePath))
|
|
{
|
|
break;
|
|
}
|
|
std::this_thread::sleep_for(std::chrono::milliseconds{20});
|
|
}
|
|
std::ofstream touch(lockFilePath);
|
|
#endif
|
|
}
|
|
|
|
void Database::UnlockFile(std::filesystem::path file)
|
|
{
|
|
#if 1
|
|
std::lock_guard<std::mutex> lock(m_lockedFilesMutex);
|
|
auto it = std::find(m_lockedFiles.begin(), m_lockedFiles.end(), file);
|
|
if (it != m_lockedFiles.end())
|
|
{
|
|
m_lockedFiles.erase(it);
|
|
}
|
|
#else
|
|
const std::filesystem::path lockFilePath = file.string() + ".lock";
|
|
if (std::filesystem::exists(lockFilePath))
|
|
{
|
|
std::filesystem::remove(lockFilePath);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/// Loads the stored article headers of a newsgroup.
///
/// The article file is held locked (LockFile/UnlockFile) while it is read.
///
/// @param newsgroup Name of the newsgroup.
/// @return The headers read from the newsgroup's article file.
/// @throws DatabaseException (ENOTFOUND) when no article file exists for the
///         newsgroup.
std::unique_ptr<std::vector<NntpHeader>> Database::LoadArticleList(
    const std::wstring& newsgroup)
{
    const auto articleFile = GetArticleFilePath(newsgroup);
    if (!std::filesystem::exists(articleFile))
    {
        throw DatabaseException(ENOTFOUND,
            "No article list found for newsgroup " + m_conv.to_bytes(newsgroup)
        );
    }

    LockFile(articleFile);
    ScopeExit unlockFile([&](){ UnlockFile(articleFile); });

    SerializableFile io;
    io.open(articleFile, std::ios::binary | std::ios::in);

    // Zero-initialised so that a failed read (e.g. the file could not be
    // opened or is empty) yields an empty list instead of looping over an
    // indeterminate count.
    std::uint64_t articleCount{0};
    io.read(
        reinterpret_cast<char*>(&articleCount),
        sizeof(articleCount)
    );

    auto result = std::make_unique<std::vector<NntpHeader>>();
    for (std::uint64_t i = 0; i != articleCount; ++i)
    {
        NntpHeader header;
        io >> header;
        result->emplace_back(header);
    }
    io.close();
    return result;
}
|
|
|
|
std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
|
|
{
|
|
OpenNewsGroupFile();
|
|
|
|
std::uint64_t dbVersion{0};
|
|
m_newsGroupFileIO.read(
|
|
reinterpret_cast<char*>(&dbVersion),
|
|
sizeof(dbVersion)
|
|
);
|
|
if (dbVersion != m_databaseVersion)
|
|
{
|
|
throw DatabaseException(EINVAL,
|
|
"The loaded database version (" + std::to_string(dbVersion)
|
|
+ ") does not match the current database version ("
|
|
+ std::to_string(m_databaseVersion) + ")");
|
|
}
|
|
|
|
size_t newsGroupCount{0};
|
|
m_newsGroupFileIO.read(
|
|
reinterpret_cast<char*>(&newsGroupCount),
|
|
sizeof(newsGroupCount)
|
|
);
|
|
|
|
auto result = std::make_unique<std::vector<NntpListEntry>>();
|
|
for (size_t numLoaded = 0; numLoaded != newsGroupCount; ++numLoaded)
|
|
{
|
|
NntpListEntry entry;
|
|
m_newsGroupFileIO >> entry;
|
|
result->emplace_back(entry);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void Database::Open(std::filesystem::path dbPath)
|
|
{
|
|
m_databasePath = dbPath;
|
|
if (!std::filesystem::exists(dbPath))
|
|
{
|
|
std::filesystem::create_directory(dbPath);
|
|
}
|
|
OpenNewsGroupFile();
|
|
}
|
|
|
|
void Database::OpenNewsGroupFile()
|
|
{
|
|
if (m_newsGroupFileIO.is_open() && m_newsGroupFileIO.is_open())
|
|
{
|
|
return;
|
|
}
|
|
const std::filesystem::path newsGroupFilePath =
|
|
m_databasePath / "newsgroups.db";
|
|
if (!m_newsGroupFileIO.is_open())
|
|
{
|
|
m_newsGroupFileIO.open(newsGroupFilePath,
|
|
std::ios::binary | std::ios::in | std::ios::out);
|
|
}
|
|
}
|
|
|
|
void Database::UpdateArticleList(
|
|
const std::wstring& newsgroup,
|
|
const std::vector<NntpHeader>& headers)
|
|
{
|
|
const auto articleFile = GetArticleFilePath(newsgroup, true);
|
|
|
|
LockFile(articleFile);
|
|
ScopeExit unlockFile([&](){
|
|
UnlockFile(articleFile);
|
|
});
|
|
|
|
SerializableFile io;
|
|
io.open(articleFile, std::ios::binary | std::ios::out);
|
|
const std::uint64_t articleCount = headers.size();
|
|
io << articleCount;
|
|
for (const auto& header: headers)
|
|
{
|
|
io << header;
|
|
}
|
|
io.close();
|
|
}
|
|
|
|
void Database::UpdateNewsgroupList(const std::vector<NntpListEntry>& list)
|
|
{
|
|
OpenNewsGroupFile();
|
|
|
|
m_newsGroupFileIO.write(
|
|
reinterpret_cast<const char*>(&m_databaseVersion),
|
|
sizeof(m_databaseVersion)
|
|
);
|
|
|
|
const std::uint64_t newsGroupCount = list.size();
|
|
m_newsGroupFileIO << newsGroupCount;
|
|
|
|
for (const auto& entry: list)
|
|
{
|
|
m_newsGroupFileIO << entry;
|
|
}
|
|
m_newsGroupFileIO.flush();
|
|
}
|
|
|
|
void Database::SaveSearchTokens(
|
|
std::uint64_t articleID,
|
|
const std::string& searchString)
|
|
{
|
|
const std::string sstr(searchString);
|
|
StringTreeOperation(
|
|
sstr,
|
|
" ",
|
|
m_maxTreeDepth,
|
|
[&](const std::string& subToken, const std::string& str){
|
|
try
|
|
{
|
|
SaveToken(subToken, str, articleID);
|
|
}
|
|
catch (const SerializeException& e)
|
|
{
|
|
/// @todo do graceful magic here.
|
|
std::cout << "Broken file for \"" << subToken << "\"" << std::endl;
|
|
}
|
|
}
|
|
);
|
|
}
|
|
|
|
void Database::SaveToken(
|
|
const std::string& subtoken,
|
|
const std::string& subject,
|
|
std::uint64_t articleID)
|
|
{
|
|
if (subtoken == "") return;
|
|
if (subtoken == " ") return;
|
|
if (subject == "") return;
|
|
if (subject == " ") return;
|
|
if (HasToken(subtoken, subject, articleID)) return;
|
|
const auto path = GetTokenFilePath(subtoken, true);
|
|
size_t startPos{0};
|
|
size_t endPos{0};
|
|
size_t count{0};
|
|
ArticleEntry entry{};
|
|
bool found = GetArticleEntry(
|
|
subtoken, subject, entry, startPos, endPos, count);
|
|
|
|
std::uint64_t entryCount{0};
|
|
|
|
// Update existing?
|
|
if (!std::filesystem::exists(path))
|
|
{
|
|
// Creating a new token file is pretty simple.
|
|
entryCount = 1;
|
|
SerializableFile io;
|
|
LockFile(path);
|
|
ScopeExit unlockFile([&](){
|
|
if (io.is_open()) io.close();
|
|
UnlockFile(path);
|
|
});
|
|
io.open(path, std::ios::binary | std::ios::out);
|
|
io << entryCount;
|
|
entry.searchString = subject;
|
|
entry.articleIDs.emplace_back(articleID);
|
|
io << entry;
|
|
}
|
|
else
|
|
{
|
|
// Updating an existing token file is a bit more complicated.
|
|
// See if the search string already exists in the token file.
|
|
// Open file.
|
|
SerializableFile io;
|
|
io.open(path, std::ios::binary | std::ios::in | std::ios::out);
|
|
LockFile(path);
|
|
ScopeExit unlockFile([&](){
|
|
if (io.is_open()) io.close();
|
|
UnlockFile(path);
|
|
});
|
|
// Read entry count.
|
|
io >> entryCount;
|
|
if (found)
|
|
{
|
|
/* Seek to the end of the entry and read all following entries. */
|
|
io.seekg(endPos);
|
|
std::vector<ArticleEntry> entries;
|
|
for (auto i = count; i != entryCount; ++i)
|
|
{
|
|
ArticleEntry e{};
|
|
io >> e;
|
|
entries.emplace_back(e);
|
|
}
|
|
// Seek back to the start of the existing entry and rewrite it.
|
|
io.seekg(startPos);
|
|
io << entry;
|
|
// Now re-write all following entries.
|
|
for (auto& e: entries)
|
|
{
|
|
io << e;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// This is a new search string in the token file, append to the end.
|
|
io.seekg(0, std::ios_base::end);
|
|
entry.searchString = subject;
|
|
entry.articleIDs = {articleID};
|
|
io << entry;
|
|
// Increment entry count.
|
|
entryCount++;
|
|
io.seekg(0);
|
|
io << entryCount;
|
|
}
|
|
}
|
|
}
|
|
|
|
} // namespace usenetsearch
|