/* Copyright© 2021 John Sennesael UsenetSearch is Free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. UsenetSearch is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with UsenetSearch. If not, see . */ #include #include "usenetsearch/StringUtils.h" #include "usenetsearch/Indexer.h" namespace usenetsearch { // SearchResult class ---------------------------------------------------------- SearchResult::SearchResult(const ArticleEntry& entry) { m_newsgroupId = entry.newsgroupID; m_articleId = entry.articleID; } SearchResult::SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId): m_newsgroupId(newsgroupId), m_articleId(articleId) { } SearchResult::SearchResult(const SearchResult& other) { m_articleId = other.m_articleId; m_newsgroupId = other.m_newsgroupId; m_numHits = other.m_numHits; }; std::uint32_t SearchResult::ArticleId() const { return m_articleId; } size_t SearchResult::Hits() const { return m_numHits; } void SearchResult::Inc() { m_numHits++; } std::uint32_t SearchResult::NewsgroupId() const { return m_newsgroupId; } void SearchResult::operator=(const SearchResult& other) { m_articleId = other.m_articleId; m_newsgroupId = other.m_newsgroupId; m_numHits = other.m_numHits; } bool SearchResult::operator==(const SearchResult& other) const { const bool result = (m_articleId == other.m_articleId) && (m_newsgroupId == other.m_newsgroupId); return result; } bool SearchResult::operator!=(const SearchResult& other) const { return ( (m_articleId != other.m_articleId) || (m_newsgroupId != other.m_newsgroupId) ); } bool SearchResult::operator<(const SearchResult& other) const { return m_numHits < other.m_numHits; } bool SearchResult::operator>(const SearchResult& other) const { return m_numHits > other.m_numHits; } bool SearchResult::operator>=(const SearchResult& other) const { return m_numHits >= other.m_numHits; } bool SearchResult::operator<=(const SearchResult& other) const { return m_numHits <= other.m_numHits; } // Indexer class --------------------------------------------------------------- Indexer::Indexer(Application& app, UsenetClient& client) : m_app(app), m_client(client) { m_threads.MaxThreads(m_app.GetConfig().MaxThreads()); } void Indexer::Connect() { m_client.Connect( m_app.GetConfig().NNTPServerHost(), m_app.GetConfig().NNTPServerPort(), m_app.GetConfig().NNTPServerSSL() ); m_client.Authenticate( m_conv.from_bytes(m_app.GetConfig().NNTPServerUser()), m_conv.from_bytes(m_app.GetConfig().NNTPServerPassword()) ); } void Indexer::Index(const std::vector& newsgroups) { /** * @todo Replace all stdout stuff with Logger class. */ const size_t batchSize = m_app.GetConfig().BatchSize(); for (const auto& group: newsgroups) { const std::wstring newsgroup = m_conv.from_bytes(group.name); std::cout << "Setting group to " << group.name << "..."; std::cout.flush(); m_client.Group(newsgroup); std::cout << "DONE." << std::endl; std::cout << "Reading headers in " << group.name << " " << "(.=" << batchSize << " headers)." << std::endl; std::cout.flush(); std::atomic headerCount{0}; const std::atomic groupID = group.id; std::reference_wrapper dbref = std::ref(m_app.GetDb()); const std::uint32_t startMessage = dbref.get().GetLastIndexedArticle( groupID ); std::cout << "Indexing starting at message: " << std::to_string(startMessage) << std::endl; m_client.ProcessHeaders(startMessage, [this, &headerCount, &dbref, &groupID](std::shared_ptr headers){ m_threads.Queue([this, headers, &headerCount, &dbref, &groupID](){ std::uint64_t lastArticle{0}; for (const auto& header: *headers) { const std::uint64_t articleID{header.articleID}; std::string subject = header.subject; subject = m_app.GetFilter().ProcessSearchString( subject ); if (subject == "") continue; dbref.get().SaveSearchTokens( groupID, articleID, subject ); headerCount++; if (articleID > lastArticle) lastArticle = articleID; } // Update last-indexed id for the newsgroup. const std::uint32_t lastIndexedID = dbref.get().GetLastIndexedArticle(groupID); if (lastIndexedID < lastArticle) { dbref.get().SetLastIndexedArticle( groupID, lastArticle ); } std::cout << "."; std::cout.flush(); }); }, batchSize ); m_threads.JoinThreads(); std::cout << "DONE." << std::endl; std::cout << "Saved " << headerCount << " headers." << std::endl; std::cout.flush(); } } std::unique_ptr Indexer::Search( const std::string& searchString) { auto result = std::make_unique(); const std::string sstr = m_app.GetFilter().ProcessSearchString( searchString ); auto searchResults = m_app.GetDb().Search(sstr); if (!searchResults) return result; for(const ArticleEntry& entry: *searchResults) { SearchResult sr(entry); // Check if a matching entry already exists in the result set, if so, // increment count. Otherwise, append a new entry. auto it = std::find(result->begin(), result->end(), sr); if (it != result->end()) { (*it).Inc(); } else { sr.Inc(); result->emplace_back(sr); } } std::sort( result->begin(), result->end(), [](const SearchResult& a, const SearchResult& b){ return a.Hits() > b.Hits(); } ); return result; } } // namespace usenetsearch