/* Copyright© 2021 John Sennesael UsenetSearch is Free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. UsenetSearch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with UsenetSearch. If not, see . */ #include #include #include #include #include #include #include #include #include "usenetsearch/Application.h" #include "usenetsearch/StringUtils.h" #include "usenetsearch/UsenetClient.h" #include "usenetsearch/ScopeExit.h" #include "usenetsearch/Serialize.h" #include "usenetsearch/Database.h" namespace usenetsearch { // Database class -------------------------------------------------------------- Database::Database(Application& app): m_app(app) { } Database::~Database() { m_newsGroupFileIO.Close(); } std::unique_ptr Database::FindNntpEntry( const std::string& subject) { OpenNewsGroupFile(); ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); }); const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64(); std::unique_ptr result = nullptr; for (std::uint64_t n = 0; n != numGroups; ++n) { NntpListEntry entry; m_newsGroupFileIO >> entry; if (entry.name == subject) { result = std::make_unique(entry); break; } } return result; } std::uint32_t Database::GetLastIndexedArticle(std::uint64_t newsgroupID) { OpenNewsGroupFile(); ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); }); const std::uint64_t numGroups = m_newsGroupFileIO.ReadInt64(); for (std::uint64_t n = 0; n != numGroups; ++n) { NntpListEntry entry; m_newsGroupFileIO >> entry; if (entry.id == newsgroupID) { return entry.lastIndexedArticle; } } return 0; } std::filesystem::path Database::GetTokenFilePath( const std::string& token, bool mkdirs ) { const std::string md5 = StringHash(token); const std::string tok1 = md5.substr(0, 2); const std::string tok2 = md5.substr(2, 1); const std::string tok3 = md5.substr(3, 2); const std::filesystem::path groupPath = m_databasePath / "tokens" / tok1 / tok2; if (mkdirs) { if (!std::filesystem::exists(groupPath)) { std::filesystem::create_directories(groupPath); } } const auto groupFile = tok3 + ".db"; return groupPath / groupFile; } std::uint64_t Database::GetUniqueNntpEntryId( const std::vector& list) const { std::uint64_t result{0}; for (auto& entry: list) { if (result <= entry.id) { result = entry.id + 1; } } return result; } void Database::MaxTreeDepth(std::uint8_t depth) { m_maxTreeDepth = depth; } std::unique_ptr> Database::LoadNewsgroupList() { OpenNewsGroupFile(); ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); }); const size_t newsGroupCount = m_newsGroupFileIO.ReadInt64(); auto result = std::make_unique>(); for (size_t numLoaded = 0; numLoaded != newsGroupCount; ++numLoaded) { NntpListEntry entry; m_newsGroupFileIO >> entry; result->emplace_back(entry); } return result; } void Database::Open(std::filesystem::path dbPath) { m_databasePath = dbPath; if (!std::filesystem::exists(dbPath)) { std::filesystem::create_directory(dbPath); } } void Database::OpenNewsGroupFile() { if (m_newsGroupFileIO.IsOpen()) { m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg); return; } const std::filesystem::path newsGroupFilePath = m_databasePath / "newsgroups.db"; const bool exists = std::filesystem::exists(newsGroupFilePath); m_newsGroupFileIO.Open(newsGroupFilePath); if (exists) { const std::uint64_t ver = m_newsGroupFileIO.ReadInt64(); if (ver != m_databaseVersion) { throw DatabaseException(EBADF, std::string{"Mismatching newgroup file database version:"} + " have: " + std::to_string(ver) + " - want: " + std::to_string(m_databaseVersion) ); } } else { m_newsGroupFileIO << m_databaseVersion; m_newsGroupFileIO << std::uint64_t{0}; // newsgroup count. m_newsGroupFileIO.Seek(sizeof(m_databaseVersion), std::ios_base::beg); } } void Database::ParseTokenFile( const std::filesystem::path& dbFile, std::function onParse) { if (!std::filesystem::exists(dbFile)) { throw DatabaseException( ENOTFOUND, "File does not exist: " + dbFile.string() ); } SerializableFile io; io.Open(dbFile); const std::uint64_t tokenCount = io.ReadInt64(); for (std::uint64_t i = 0; i != tokenCount; ++i) { ArticleEntry token; io >> token; onParse(token); } } void Database::SetLastIndexedArticle( std::uint64_t newsgroupID, std::int32_t articleID) { auto outItems = LoadNewsgroupList(); bool found{false}; if (outItems) { for (auto& entry: *outItems) { if (entry.id == newsgroupID) { entry.lastIndexedArticle = articleID; found = true; } } } if (!found) { throw DatabaseException(EINVAL, "Attempt to update newsgroup not found in database - id: " + std::to_string(newsgroupID)); } UpdateNewsgroupList(*outItems); } void Database::SaveSearchTokens( std::uint64_t newsgroupID, std::uint64_t articleID, const std::string& searchString) { const std::string sstr(searchString); StringTreeOperation( sstr, " ", m_maxTreeDepth, [&](const std::string& subToken, const std::string& str){ const std::string tok = m_app.GetFilter().ProcessToken( subToken, str ); if (tok.empty()) return; SaveToken(tok, newsgroupID, articleID); } ); } bool Database::HasToken( const std::string& subtoken, std::uint64_t newsgroupID, std::uint32_t articleID) { if (subtoken.empty()) return false; const std::filesystem::path path = GetTokenFilePath(subtoken, true); if (!std::filesystem::exists(path)) return false; SerializableFile io; io.Open(path); const std::uint64_t tokenCount = io.ReadInt64(); for (std::uint64_t i = 0; i != tokenCount; ++i) { ArticleEntry token; io >> token; if (token.newsgroupID == newsgroupID) { if (token.articleID == articleID) return true; } } return false; } std::unique_ptr> Database::LoadTokens( const std::filesystem::path dbFile, const std::string& subtoken) { auto result = std::make_unique>(); if (!std::filesystem::exists(dbFile)) return result; SerializableFile io; io.Open(dbFile); const std::uint64_t tokenCount = io.ReadInt64(); const auto tokenHash = StringHashBytes(subtoken); for (std::uint64_t ntok = 0; ntok != tokenCount; ++ntok) { ArticleEntry entry{}; io >> entry; if (entry.hash != tokenHash) continue; result->emplace_back(entry); } return result; } void Database::SaveToken( const std::string& subtoken, std::uint64_t newsgroupID, std::uint32_t articleID) { if (subtoken.empty()) return; const std::filesystem::path path = GetTokenFilePath(subtoken, true); const bool exists = std::filesystem::exists(path); SerializableFile io; io.Open(path); ArticleEntry token{}; token.articleID = articleID; token.newsgroupID = newsgroupID; token.hash = StringHashBytes(subtoken); if (exists) { // Read token count and increment it by one, and write it back. const std::uint64_t tokenCount = io.ReadInt64() + 1; io.Seek(0, std::ios_base::beg); io << tokenCount; // Now seek back to the end of the file to append our token entry. io.Seek(0, std::ios_base::end); } else { // A new file just has a token count of 1 - should already be at pos=0. io << std::uint64_t{1}; } // write out token. #if 0 std::cout << "Token: " << subtoken << std::endl; std::cout << "Saving into file: " << path << std::endl; std::cout << "Token hash: " << HashBytesToString(token.hash) << std::endl << std::endl; #endif io << token; } std::unique_ptr> Database::Search( const std::string& searchString) { auto result = std::make_unique>(); // Tokenize the search string. std::vector searchTokens; StringTreeOperation( searchString, " ", m_maxTreeDepth, [&searchTokens](const std::string& subToken, const std::string&){ searchTokens.emplace_back(subToken); } ); for (const auto& searchToken: searchTokens) { const auto path = GetTokenFilePath(searchToken, false); const bool exists = std::filesystem::exists(path); if (!exists) continue; const auto foundTokens = LoadTokens(path, searchToken); if (foundTokens->empty()) continue; result->insert(result->end(), foundTokens->begin(), foundTokens->end()); std::cout << std::left << std::setw(searchString.length() + 7) << "token: " + searchToken << std::setw(3) << " | " << std::setw(10) << "db file: " << path.string() << std::setw(3) << " | " << std::setw(9) << "#results: " + std::to_string(foundTokens->size()) << std::endl; } return result; } void Database::UpdateNewsgroupList(std::vector& list) { if (list.size() == 0) return; auto outList = LoadNewsgroupList(); for (auto& entry: list) { NntpListEntry newEntry(entry); bool found{false}; if (outList) { std::for_each( outList->begin(), outList->end(), [&entry, &found](NntpListEntry& oldEntry) { if (oldEntry.name == entry.name) { // update existing (copy everything but ID & name) found = true; oldEntry.count = entry.count; oldEntry.high = entry.high; oldEntry.lastIndexedArticle = entry.lastIndexedArticle; oldEntry.low = entry.low; oldEntry.status = entry.status; } } ); } if (found) continue; // add new. newEntry.id = GetUniqueNntpEntryId(*outList); outList->emplace_back(newEntry); entry.id = newEntry.id; } OpenNewsGroupFile(); ScopeExit closeNewsGroupFile([&](){ m_newsGroupFileIO.Close(); }); m_newsGroupFileIO << std::uint64_t{outList->size()}; std::for_each( outList->begin(), outList->end(), [&](const NntpListEntry& e) { m_newsGroupFileIO << e; } ); } } // namespace usenetsearch