/*
    Copyright© 2021 John Sennesael

    UsenetSearch is Free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch.  If not, see <https://www.gnu.org/licenses/>.
*/

// NOTE(review): the copy of this file that was reviewed had every
// angle-bracketed span stripped (system include names, template arguments,
// the license URL). Those were reconstructed from how the values are used
// below - confirm against Database.h, in particular the ParseTokenFile
// callback signature and any explicit template arguments on Logger::Fatal.

#include "usenetsearch/Database.h"

#include "usenetsearch/Application.h"
#include "usenetsearch/Logger.h"
#include "usenetsearch/StringUtils.h"
#include "usenetsearch/ScopeExit.h"
#include "usenetsearch/Serialize.h"
#include "usenetsearch/UsenetClient.h"

#include <algorithm>
#include <cstdint>
#include <filesystem>
#include <functional>
#include <ios>
#include <memory>
#include <string>
#include <utility>
#include <vector>

namespace usenetsearch {

// Database class --------------------------------------------------------------

// Rewinds the file to offset 0, reads a 64-bit version number and reports a
// fatal error when it differs from m_databaseVersion. Callers read the next
// field of the file immediately after this returns.
void Database::CheckDbVersion(const SerializableFile& f) const
{
    f.Seek(0);
    const std::uint64_t ver = f.ReadInt64();
    if (ver != m_databaseVersion)
    {
        Logger::Get().Fatal(
            LOGID("Database"),
            "Wrong database version - Got: " + std::to_string(ver)
                + " want: " + std::to_string(m_databaseVersion)
        );
    }
}

// Scans the newsgroup file for the first entry whose id matches.
// Returns nullptr when the file does not exist or no entry matches.
std::unique_ptr<NntpListEntry> Database::FindNntpEntry(std::uint64_t id)
{
    const auto path = GetNewsGroupFilePath();
    if (!std::filesystem::exists(path)) return nullptr;
    SerializableFile io;
    io.Open(path);
    CheckDbVersion(io);
    const std::uint64_t numGroups = io.ReadInt64();
    for (std::uint64_t n = 0; n != numGroups; ++n)
    {
        NntpListEntry entry;
        io >> entry;
        if (entry.id == id)
        {
            return std::make_unique<NntpListEntry>(std::move(entry));
        }
    }
    return nullptr;
}

// Scans the newsgroup file for the first entry whose name equals `subject`.
// Returns nullptr when the file does not exist or no entry matches.
std::unique_ptr<NntpListEntry> Database::FindNntpEntry(
    const std::string& subject)
{
    const auto path = GetNewsGroupFilePath();
    if (!std::filesystem::exists(path)) return nullptr;
    SerializableFile io;
    io.Open(path);
    CheckDbVersion(io);
    const std::uint64_t numGroups = io.ReadInt64();
    for (std::uint64_t n = 0; n != numGroups; ++n)
    {
        NntpListEntry entry;
        io >> entry;
        if (entry.name == subject)
        {
            return std::make_unique<NntpListEntry>(std::move(entry));
        }
    }
    return nullptr;
}

// Returns the lastIndexedArticle value stored for the given newsgroup id.
// Reports a fatal error (and yields NntpListEntry::NOT_INDEXED if the logger
// returns) when the newsgroup file is missing or has no such entry.
std::uint32_t Database::GetLastIndexedArticle(std::uint64_t newsgroupID)
{
    const auto path = GetNewsGroupFilePath();
    if (!std::filesystem::exists(path))
    {
        Logger::Get().Fatal(
            LOGID("Database"),
            "No indexed articles for newsgroup: " + std::to_string(newsgroupID)
        );
        // Do not fall through and open a file we know is not there.
        return NntpListEntry::NOT_INDEXED;
    }
    SerializableFile io;
    io.Open(path);
    CheckDbVersion(io);
    const std::uint64_t numGroups = io.ReadInt64();
    for (std::uint64_t n = 0; n != numGroups; ++n)
    {
        NntpListEntry entry;
        io >> entry;
        if (entry.id == newsgroupID)
        {
            return entry.lastIndexedArticle;
        }
    }
    Logger::Get().Fatal(
        LOGID("Database"),
        "No indexed articles for newsgroup: " + std::to_string(newsgroupID)
    );
    return NntpListEntry::NOT_INDEXED;
}

// Maps a token to the file that stores its entries:
//   <db>/tokens/<hash[0..1]>/<hash[2]>/<hash[3..4]>.db
// where hash is the string returned by StringHash(token). When `mkdirs` is
// true the containing directory is created if it does not exist yet.
std::filesystem::path Database::GetTokenFilePath(
    const std::string& token,
    bool mkdirs
)
{
    const std::string md5 = StringHash(token);
    const std::string tok1 = md5.substr(0, 2);
    const std::string tok2 = md5.substr(2, 1);
    const std::string tok3 = md5.substr(3, 2);
    const std::filesystem::path groupPath =
        m_databasePath / "tokens" / tok1 / tok2;
    if (mkdirs && !std::filesystem::exists(groupPath))
    {
        std::filesystem::create_directories(groupPath);
    }
    const auto groupFile = tok3 + ".db";
    return groupPath / groupFile;
}

// Returns an id that is greater than every id in `list` (0 for an empty list).
std::uint64_t Database::GetUniqueNntpEntryId(
    const std::vector<NntpListEntry>& list) const
{
    std::uint64_t result{0};
    for (const auto& entry: list)
    {
        if (result <= entry.id)
        {
            result = entry.id + 1;
        }
    }
    return result;
}

// Sets the depth passed to StringTreeOperation when tokenizing.
void Database::MaxTreeDepth(std::uint8_t depth)
{
    m_maxTreeDepth = depth;
}

// Reads every entry of the newsgroup file. Always returns a non-null
// vector; it is empty when the file does not exist.
std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
{
    auto result = std::make_unique<std::vector<NntpListEntry>>();
    const auto path = GetNewsGroupFilePath();
    if (!std::filesystem::exists(path)) return result;
    SerializableFile io;
    io.Open(path);
    CheckDbVersion(io);
    // Same 64-bit type as the other readers of this field.
    const std::uint64_t newsGroupCount = io.ReadInt64();
    for (std::uint64_t numLoaded = 0; numLoaded != newsGroupCount; ++numLoaded)
    {
        NntpListEntry entry;
        io >> entry;
        result->emplace_back(std::move(entry));
    }
    return result;
}

// Remembers the database directory and creates it when missing.
void Database::Open(std::filesystem::path dbPath)
{
    m_databasePath = dbPath;
    if (!std::filesystem::exists(dbPath))
    {
        // Recursive, so a missing parent directory is not an error.
        std::filesystem::create_directories(dbPath);
    }
}

// Path of the file holding the newsgroup list.
std::filesystem::path Database::GetNewsGroupFilePath() const
{
    return m_databasePath / "newsgroups.db";
}

// Reads a token file entry by entry and hands each one to `onParse`.
// Stops early when the application has been asked to stop. Reports a fatal
// error when the file does not exist.
void Database::ParseTokenFile(
    const std::filesystem::path& dbFile,
    std::function<void(const ArticleEntry&)> onParse)
{
    if (!std::filesystem::exists(dbFile))
    {
        Logger::Get().Fatal(
            LOGID("Database"),
            "File does not exist: " + dbFile.string()
        );
        // Do not fall through and open a file we know is not there.
        return;
    }
    SerializableFile io;
    io.Open(dbFile);
    const std::uint64_t tokenCount = io.ReadInt64();
    for (std::uint64_t i = 0; i != tokenCount; ++i)
    {
        if (Application::Get().ShouldStop()) return;
        ArticleEntry token;
        io >> token;
        onParse(token);
    }
}

// Stores `articleID` as the last indexed article of the given newsgroup and
// writes the list back. Reports a fatal error when the newsgroup is unknown.
void Database::SetLastIndexedArticle(
    std::uint64_t newsgroupID,
    std::int32_t articleID)
{
    auto outItems = LoadNewsgroupList();
    bool found{false};
    if (outItems)
    {
        for (auto& entry: *outItems)
        {
            if (entry.id == newsgroupID)
            {
                entry.lastIndexedArticle = articleID;
                found = true;
                break;
            }
        }
    }
    if (!found)
    {
        Logger::Get().Fatal(
            LOGID("Database"),
            "Attempt to update newsgroup not found in database - id: "
                + std::to_string(newsgroupID)
        );
        // Nothing changed, so there is nothing to write back.
        return;
    }
    UpdateNewsgroupList(*outItems);
}

// Splits `searchString` on spaces via StringTreeOperation, runs each piece
// through the application's filter and saves every non-empty result as a
// token for (newsgroupID, articleID).
void Database::SaveSearchTokens(
    std::uint64_t newsgroupID,
    std::uint64_t articleID,
    const std::string& searchString)
{
    StringTreeOperation(
        searchString,
        " ",
        m_maxTreeDepth,
        [&](const std::string& subToken, const std::string& str)
        {
            const std::string tok =
                Application::Get().GetFilter().ProcessToken(subToken, str);
            if (tok.empty()) return;
            SaveToken(tok, newsgroupID, articleID);
        }
    );
}

// Returns true when the token file for `subtoken` contains an entry with the
// given newsgroup and article ids. Note that the comparison is on the ids
// only; the entry's hash is not compared here.
bool Database::HasToken(
    const std::string& subtoken,
    std::uint64_t newsgroupID,
    std::uint32_t articleID)
{
    if (subtoken.empty()) return false;
    // A lookup must not create directories as a side effect.
    const std::filesystem::path path = GetTokenFilePath(subtoken, false);
    if (!std::filesystem::exists(path)) return false;
    SerializableFile io;
    io.Open(path);
    const std::uint64_t tokenCount = io.ReadInt64();
    for (std::uint64_t i = 0; i != tokenCount; ++i)
    {
        ArticleEntry token;
        io >> token;
        if (token.newsgroupID == newsgroupID && token.articleID == articleID)
        {
            return true;
        }
    }
    return false;
}

// Reads `dbFile` and returns the entries whose hash equals
// StringHashBytes(subtoken). Always returns a non-null vector; it is empty
// when the file does not exist.
std::unique_ptr<std::vector<ArticleEntry>> Database::LoadTokens(
    const std::filesystem::path dbFile,
    const std::string& subtoken)
{
    auto result = std::make_unique<std::vector<ArticleEntry>>();
    if (!std::filesystem::exists(dbFile)) return result;
    SerializableFile io;
    io.Open(dbFile);
    const std::uint64_t tokenCount = io.ReadInt64();
    const auto tokenHash = StringHashBytes(subtoken);
    for (std::uint64_t ntok = 0; ntok != tokenCount; ++ntok)
    {
        ArticleEntry entry{};
        io >> entry;
        if (entry.hash != tokenHash) continue;
        result->emplace_back(entry);
    }
    return result;
}

// Appends one entry for `subtoken` to its token file, creating the file (and
// its directory) when needed. The file starts with a 64-bit entry count
// followed by the entries themselves.
void Database::SaveToken(
    const std::string& subtoken,
    std::uint64_t newsgroupID,
    std::uint32_t articleID)
{
    if (subtoken.empty()) return;
    const std::filesystem::path path = GetTokenFilePath(subtoken, true);
    // Must be sampled before Open(), which is used below on new paths too.
    const bool exists = std::filesystem::exists(path);
    SerializableFile io;
    io.Open(path);

    ArticleEntry token{};
    token.articleID = articleID;
    token.newsgroupID = newsgroupID;
    token.hash = StringHashBytes(subtoken);

    if (exists)
    {
        // Read token count and increment it by one, and write it back.
        const std::uint64_t tokenCount = io.ReadInt64() + 1;
        io.Seek(0, std::ios_base::beg);
        io << tokenCount;
        // Now seek back to the end of the file to append our token entry.
        io.Seek(0, std::ios_base::end);
    }
    else
    {
        // A new file just has a token count of 1 - should already be at pos=0.
        io << std::uint64_t{1};
    }
    // write out token.
    io << token;
}

// Tokenizes `searchString` the same way SaveSearchTokens does (minus the
// filter step) and returns the concatenation of the matching entries of
// every token, in token order. The result may contain duplicates.
std::unique_ptr<std::vector<ArticleEntry>> Database::Search(
    const std::string& searchString)
{
    auto result = std::make_unique<std::vector<ArticleEntry>>();
    // Tokenize the search string.
    std::vector<std::string> searchTokens;
    StringTreeOperation(
        searchString,
        " ",
        m_maxTreeDepth,
        [&searchTokens](const std::string& subToken, const std::string&)
        {
            searchTokens.emplace_back(subToken);
        }
    );
    for (const auto& searchToken: searchTokens)
    {
        const auto path = GetTokenFilePath(searchToken, false);
        if (!std::filesystem::exists(path)) continue;
        const auto foundTokens = LoadTokens(path, searchToken);
        result->insert(result->end(), foundTokens->begin(), foundTokens->end());
    }
    return result;
}

// Makes both entries agree on lastIndexedArticle: an entry that is
// NOT_INDEXED takes the other's value, otherwise the higher value wins.
void Database::SyncLastUpdated(
    NntpListEntry& a,
    NntpListEntry& b
)
{
    // If both are equal, nothing to do here.
    if (a.lastIndexedArticle == b.lastIndexedArticle) return;
    // Whichever one's not indexed, gets the value of the other.
    if (a.lastIndexedArticle == NntpListEntry::NOT_INDEXED)
    {
        a.lastIndexedArticle = b.lastIndexedArticle;
        return;
    }
    if (b.lastIndexedArticle == NntpListEntry::NOT_INDEXED)
    {
        b.lastIndexedArticle = a.lastIndexedArticle;
        return;
    }
    // Otherwise, whichever's higher wins.
    if (a.lastIndexedArticle > b.lastIndexedArticle)
    {
        b.lastIndexedArticle = a.lastIndexedArticle;
    }
    else
    {
        a.lastIndexedArticle = b.lastIndexedArticle;
    }
}

// Merges `list` into the stored newsgroup list and writes the result back.
// Entries are matched by name: a match has its count/high/low/status
// replaced and lastIndexedArticle synchronised; anything else is appended
// with a fresh id. On return every element of `list` carries the id that is
// stored in the database. Does nothing when `list` is empty.
void Database::UpdateNewsgroupList(std::vector<NntpListEntry>& list)
{
    if (list.empty()) return;
    auto outList = LoadNewsgroupList();
    if (!outList)
    {
        outList = std::make_unique<std::vector<NntpListEntry>>();
    }
    // Ids of stored entries never change below, so the next free id can be
    // computed once and incremented instead of re-scanning for every insert.
    std::uint64_t nextId = GetUniqueNntpEntryId(*outList);
    for (auto& entry: list)
    {
        bool found{false};
        // Deliberately no early exit: every stored entry with this name is
        // updated, exactly as before.
        for (auto& oldEntry: *outList)
        {
            if (oldEntry.name != entry.name) continue;
            // update existing - the passed entry has updated counts
            // and status.
            found = true;
            oldEntry.count = entry.count;
            oldEntry.high = entry.high;
            oldEntry.low = entry.low;
            oldEntry.status = entry.status;
            // As for lastIndexed: whoever's higher wins.
            SyncLastUpdated(entry, oldEntry);
            // The ID should be sticky - whatever's already in the
            // db is what we care about, so update it in the passed
            // entries.
            entry.id = oldEntry.id;
        }
        if (found) continue;
        // add new.
        NntpListEntry newEntry(entry);
        newEntry.id = nextId++;
        entry.id = newEntry.id;
        outList->emplace_back(std::move(newEntry));
    }
    SerializableFile io;
    io.Open(GetNewsGroupFilePath());
    io << m_databaseVersion;
    io << std::uint64_t{outList->size()};
    for (const auto& e: *outList)
    {
        io << e;
    }
}

} // namespace usenetsearch