/*
    Copyright© 2021 John Sennesael

    UsenetSearch is Free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/

#include "usenetsearch/Indexer.h"

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "usenetsearch/Application.h"
#include "usenetsearch/Logger.h"
#include "usenetsearch/StringUtils.h"

namespace usenetsearch {

// SearchResult class ----------------------------------------------------------
//
// A SearchResult identifies one article (newsgroup id + article id) and carries
// a hit counter. Note the two different notions of comparison implemented
// below: equality looks only at the identity (newsgroup id, article id), while
// the relational operators look only at the hit count.

// Builds a result that refers to the article described by a database entry.
// The hit counter is explicitly started at zero so that Hits() is well defined
// before the first Inc().
SearchResult::SearchResult(const ArticleEntry& entry):
    m_newsgroupId(entry.newsgroupID),
    m_articleId(entry.articleID),
    m_numHits(0)
{
}

// Builds a result from explicit ids, with a hit counter of zero.
SearchResult::SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId):
    m_newsgroupId(newsgroupId),
    m_articleId(articleId),
    m_numHits(0)
{
}

// Copies the identity and the hit counter of another result.
SearchResult::SearchResult(const SearchResult& other):
    m_newsgroupId(other.m_newsgroupId),
    m_articleId(other.m_articleId),
    m_numHits(other.m_numHits)
{
}

// Returns the id of the article this result refers to.
std::uint32_t SearchResult::ArticleId() const
{
    return m_articleId;
}

// Returns the number of hits recorded so far.
size_t SearchResult::Hits() const
{
    return m_numHits;
}

// Records one more hit for this result.
void SearchResult::Inc()
{
    m_numHits++;
}

// Returns the id of the newsgroup this result refers to.
std::uint32_t SearchResult::NewsgroupId() const
{
    return m_newsgroupId;
}

// Copies the identity and the hit counter of another result.
void SearchResult::operator=(const SearchResult& other)
{
    m_articleId = other.m_articleId;
    m_newsgroupId = other.m_newsgroupId;
    m_numHits = other.m_numHits;
}

// Two results are equal when they refer to the same article in the same
// newsgroup; the hit counter is deliberately not part of the comparison.
bool SearchResult::operator==(const SearchResult& other) const
{
    return (m_articleId == other.m_articleId)
        && (m_newsgroupId == other.m_newsgroupId);
}

// Negation of operator==.
bool SearchResult::operator!=(const SearchResult& other) const
{
    return !(*this == other);
}

// The relational operators order results by hit count only.
bool SearchResult::operator<(const SearchResult& other) const
{
    return m_numHits < other.m_numHits;
}

bool SearchResult::operator>(const SearchResult& other) const
{
    return m_numHits > other.m_numHits;
}

bool SearchResult::operator>=(const SearchResult& other) const
{
    return m_numHits >= other.m_numHits;
}

bool SearchResult::operator<=(const SearchResult& other) const
{
    return m_numHits <= other.m_numHits;
}

// Indexer class ---------------------------------------------------------------

// Binds the indexer to an NNTP client and sizes the worker pool from the
// application configuration.
Indexer::Indexer(UsenetClient& client)
    : m_client(client)
{
    m_threads.MaxThreads(Application::Get().GetConfig().MaxThreads());
}

// Connects to and authenticates against the NNTP server named in the
// application configuration.
void Indexer::Connect()
{
    const auto& config = Application::Get().GetConfig();
    m_client.Connect(
        config.NNTPServerHost(),
        config.NNTPServerPort(),
        config.NNTPServerSSL()
    );
    m_client.Authenticate(
        m_conv.from_bytes(config.NNTPServerUser()),
        m_conv.from_bytes(config.NNTPServerPassword())
    );
}

// Indexes the subject headers of every newsgroup in `newsgroups`.
//
// For each group, indexing resumes after the last article id recorded in the
// database (or at 0 when nothing was indexed yet or the lookup fails). Headers
// are fetched in batches of the configured batch size; every batch is handed
// to the worker pool, which filters each subject, stores its search tokens and
// then records the highest article id of the batch as last-indexed.
//
// Returns early, without touching further groups, when the application has
// been asked to stop.
//
// NOTE(review): the element type of `newsgroups` was lost in the copy of the
// source this was reconstructed from; NntpListEntry is assumed because the
// body uses NntpListEntry::NOT_INDEXED together with .name/.id - confirm
// against Indexer.h.
void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
{
    const size_t batchSize = Application::Get().GetConfig().BatchSize();
    for (const auto& group: newsgroups)
    {
        const std::wstring newsgroup = m_conv.from_bytes(group.name);
        Logger::Get().Debug(LOGID("Indexer"), "Setting group to " + group.name);
        m_client.Group(newsgroup);
        Logger::Get().Debug(
            LOGID("Indexer"),
            "Reading headers in " + group.name + " "
            + "(batch size = " + std::to_string(batchSize) + " headers)."
        );

        // Shared between the worker tasks, hence atomic.
        std::atomic<std::uint64_t> headerCount{0};
        // Never modified after this point, so a plain const copy is enough.
        const auto groupId = group.id;

        std::uint32_t startMessage = 0;
        try
        {
            startMessage = Application::Get().GetDb().GetLastIndexedArticle(
                groupId
            );
            if (startMessage == NntpListEntry::NOT_INDEXED)
            {
                startMessage = 0;
            }
            else
            {
                // Resume with the article after the last indexed one.
                ++startMessage;
            }
        }
        catch (const DatabaseException&)
        {
            startMessage = 0;
        }
        Logger::Get().Debug(
            LOGID("Indexer"),
            "Indexing starting at message: " + std::to_string(startMessage)
        );
        if (Application::Get().ShouldStop()) return;

        // The queued tasks keep a reference to headerCount, which lives in
        // this stack frame. The pool therefore has to be joined before the
        // frame is left - including when ProcessHeaders throws.
        try
        {
            m_client.ProcessHeaders(
                startMessage,
                // `headers` is the shared pointer to the batch of headers
                // handed over by the client.
                [this, startMessage, groupId, &headerCount](auto headers)
                {
                    if (Application::Get().ShouldStop()) return;
                    m_threads.Queue(
                        [headers, startMessage, groupId, &headerCount]()
                        {
                            std::uint64_t lastArticle{0};
                            for (const auto& header: *headers)
                            {
                                if (Application::Get().ShouldStop()) return;
                                const std::uint64_t articleID{header.articleID};
                                const std::string subject =
                                    Application::Get().GetFilter()
                                        .ProcessSearchString(header.subject);
                                // Nothing searchable left after filtering.
                                if (subject.empty()) continue;
                                Application::Get().GetDb().SaveSearchTokens(
                                    groupId,
                                    articleID,
                                    subject
                                );
                                ++headerCount;
                                if (articleID > lastArticle)
                                {
                                    lastArticle = articleID;
                                }
                            }
                            if (lastArticle == NntpListEntry::NOT_INDEXED)
                            {
                                lastArticle = 0;
                            }
                            // Update last-indexed id for the newsgroup.
                            // NOTE(review): batches run concurrently, so the
                            // order in which they reach this point is
                            // presumably not guaranteed - verify that the
                            // stored id cannot move backwards.
                            if (startMessage < lastArticle)
                            {
                                Application::Get().GetDb().SetLastIndexedArticle(
                                    groupId,
                                    lastArticle
                                );
                            }
                            Logger::Get().Debug(
                                "Indexer::Index", "Finished batch."
                            );
                        }
                    );
                },
                batchSize
            );
        }
        catch (...)
        {
            m_threads.JoinThreads();
            throw;
        }
        m_threads.JoinThreads();

        Logger::Get().Debug(
            LOGID("Indexer"),
            "Saved " + std::to_string(headerCount.load()) + " headers."
        );
    }
}

// Runs a search and returns the matching articles, best match first.
//
// The search string is passed through the application's filter and handed to
// the database. Every database entry counts as one hit for the article it
// refers to; an entry whose hash equals the hash of the whole (filtered)
// search string counts as two, so that exact matches rank above partial ones.
// The result is sorted by descending hit count and is empty (never null) when
// the database returns nothing.
//
// NOTE(review): the container type was lost in the copy of the source this was
// reconstructed from; std::vector<SearchResult> is assumed - confirm that it
// matches the declaration (or alias) used in Indexer.h.
std::unique_ptr<std::vector<SearchResult>> Indexer::Search(
    const std::string& searchString)
{
    auto result = std::make_unique<std::vector<SearchResult>>();
    const std::string sstr = Application::Get().GetFilter().ProcessSearchString(
        searchString
    );
    const auto searchHash = StringHashBytes(sstr);
    auto searchResults = Application::Get().GetDb().Search(sstr);
    if (!searchResults) return result;

    // Maps (newsgroup id, article id) to the position of that article in
    // *result, replacing a linear scan of the result set per entry.
    std::unordered_map<std::uint64_t, std::size_t> positionByArticle;

    for (const ArticleEntry& entry: *searchResults)
    {
        if (Application::Get().ShouldStop())
        {
            Logger::Get().Fatal(
                "Indexer",
                "Interrupted."
            );
        }
        const SearchResult sr(entry);
        // Both ids are 32 bits wide, so together they form a unique 64-bit key.
        const std::uint64_t key =
            (static_cast<std::uint64_t>(sr.NewsgroupId()) << 32)
            | static_cast<std::uint64_t>(sr.ArticleId());

        // Append a new entry the first time an article is seen, otherwise
        // reuse the existing one.
        const auto inserted = positionByArticle.emplace(key, result->size());
        if (inserted.second)
        {
            result->emplace_back(sr);
        }
        SearchResult& hit = (*result)[inserted.first->second];
        hit.Inc();
        // An exact match gets double points to ensure it's above other
        // partial matches. This applies no matter whether the exact match is
        // the first or a later entry seen for the article.
        if (entry.hash == searchHash) hit.Inc();
    }

    std::sort(
        result->begin(),
        result->end(),
        [](const SearchResult& a, const SearchResult& b)
        {
            return a.Hits() > b.Hits();
        }
    );
    return result;
}

} // namespace usenetsearch