// UsenetSearch/src/Indexer.cpp
//
// 264 lines
// 7.9 KiB
// C++

/*
Copyright© 2021 John Sennesael
UsenetSearch is Free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include "usenetsearch/Indexer.h"

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "usenetsearch/Application.h"
#include "usenetsearch/Logger.h"
#include "usenetsearch/StringUtils.h"
namespace usenetsearch {
// SearchResult class ----------------------------------------------------------
// Builds a search result from a database article entry by taking over the
// entry's newsgroup id and article id.
SearchResult::SearchResult(const ArticleEntry& entry)
    : m_newsgroupId(entry.newsgroupID),
      m_articleId(entry.articleID)
{
}
// Builds a search result that refers to article `articleId` in newsgroup
// `newsgroupId`.
SearchResult::SearchResult(
    std::uint32_t newsgroupId,
    std::uint32_t articleId)
    : m_newsgroupId(newsgroupId),
      m_articleId(articleId)
{
}
// Copy constructor: duplicates the article id, newsgroup id and hit count of
// `other`.
SearchResult::SearchResult(const SearchResult& other)
{
    m_articleId = other.m_articleId;
    m_newsgroupId = other.m_newsgroupId;
    m_numHits = other.m_numHits;
}
std::uint32_t SearchResult::ArticleId() const
{
return m_articleId;
}
size_t SearchResult::Hits() const
{
return m_numHits;
}
// Adds one hit to this result.
void SearchResult::Inc()
{
    ++m_numHits;
}
std::uint32_t SearchResult::NewsgroupId() const
{
return m_newsgroupId;
}
// Copies the newsgroup id, article id and hit count from `other`.
// The operator returns void, so assignments cannot be chained.
void SearchResult::operator=(const SearchResult& other)
{
    m_newsgroupId = other.m_newsgroupId;
    m_articleId = other.m_articleId;
    m_numHits = other.m_numHits;
}
// Two results are equal when they refer to the same article in the same
// newsgroup. The hit count is deliberately not part of the comparison.
bool SearchResult::operator==(const SearchResult& other) const
{
    return m_newsgroupId == other.m_newsgroupId
        && m_articleId == other.m_articleId;
}
// Results differ when either the article id or the newsgroup id differs.
// The hit count is not compared.
bool SearchResult::operator!=(const SearchResult& other) const
{
    const bool sameArticle = (m_articleId == other.m_articleId);
    const bool sameNewsgroup = (m_newsgroupId == other.m_newsgroupId);
    return !(sameArticle && sameNewsgroup);
}
// Orders by hit count only: true when this result has fewer hits than
// `other`. The ids are not considered.
bool SearchResult::operator<(const SearchResult& other) const
{
    return other.m_numHits > m_numHits;
}
// Orders by hit count only: true when this result has more hits than
// `other`. The ids are not considered.
bool SearchResult::operator>(const SearchResult& other) const
{
    return other.m_numHits < m_numHits;
}
// Orders by hit count only: true when this result has at least as many hits
// as `other`. The ids are not considered.
bool SearchResult::operator>=(const SearchResult& other) const
{
    return !(m_numHits < other.m_numHits);
}
// Orders by hit count only: true when this result has at most as many hits
// as `other`. The ids are not considered.
bool SearchResult::operator<=(const SearchResult& other) const
{
    return !(m_numHits > other.m_numHits);
}
// Indexer class ---------------------------------------------------------------
// Creates an indexer bound to `client` and hands the configured MaxThreads
// value to m_threads.
Indexer::Indexer(UsenetClient& client)
    : m_client(client)
{
    auto&& config = Application::Get().GetConfig();
    m_threads.MaxThreads(config.MaxThreads());
}
// Connects the client to the NNTP server named in the application config
// (host, port, SSL flag) and then authenticates with the configured user name
// and password, both converted through m_conv.from_bytes.
void Indexer::Connect()
{
    auto&& config = Application::Get().GetConfig();

    m_client.Connect(
        config.NNTPServerHost(),
        config.NNTPServerPort(),
        config.NNTPServerSSL()
    );

    m_client.Authenticate(
        m_conv.from_bytes(config.NNTPServerUser()),
        m_conv.from_bytes(config.NNTPServerPassword())
    );
}
// Indexes the subject lines of every newsgroup in `newsgroups`.
//
// For each group this selects the group on the NNTP client, asks the database
// for the last indexed article and resumes with the article after it (or at 0
// when the group has not been indexed yet, or when the lookup throws a
// DatabaseException). The headers are then requested through
// m_client.ProcessHeaders, whose callback queues every batch of headers on
// m_threads. A queued batch runs each subject through the search-string
// filter, saves the search tokens of the subjects that are not empty
// afterwards, and finally advances the group's last-indexed article.
//
// The function returns as soon as Application::Get().ShouldStop() reports
// true, leaving the remaining groups untouched.
//
// Last-indexed ("checkpoint") handling:
//  - Queued batches are assumed to be able to run concurrently and finish in
//    any order (NOTE(review): depends on m_threads - confirm). The checkpoint
//    is therefore only ever moved forward, under a mutex.
//  - Every article of a fully processed batch counts towards the checkpoint,
//    including articles whose filtered subject is empty; otherwise those
//    articles would be fetched again on every run.
//  - NOTE(review): a later batch can still checkpoint before an earlier batch
//    has finished. If indexing is interrupted in between, the unfinished
//    batch is skipped on the next run. This is unchanged from before.
void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
{
    const size_t batchSize = Application::Get().GetConfig().BatchSize();
    for (const auto& group: newsgroups)
    {
        const std::wstring newsgroup = m_conv.from_bytes(group.name);
        Logger::Get().Debug(LOGID("Indexer"), "Setting group to " + group.name);
        m_client.Group(newsgroup);
        Logger::Get().Debug(
            LOGID("Indexer"),
            "Reading headers in " + group.name + " "
            + "(batch size = " + std::to_string(batchSize) + " headers)."
        );

        // Incremented by the batch workers, hence atomic.
        std::atomic<std::uint64_t> headerCount{0};
        const std::uint64_t groupID = group.id;

        // Work out where to resume.
        std::uint32_t startMessage = 0;
        try
        {
            startMessage = Application::Get().GetDb().GetLastIndexedArticle(
                groupID
            );
            if (startMessage == NntpListEntry::NOT_INDEXED)
            {
                startMessage = 0;
            }
            else
            {
                // Resume with the article after the last indexed one.
                ++startMessage;
            }
        }
        catch (const DatabaseException&)
        {
            startMessage = 0;
        }
        Logger::Get().Debug(
            LOGID("Indexer"),
            "Indexing starting at message: "
            + std::to_string(startMessage)
        );
        if (Application::Get().ShouldStop()) return;

        // Checkpoint bookkeeping shared by all batch workers of this group;
        // checkpointWritten / checkpointArticle are guarded by
        // checkpointMutex.
        std::mutex checkpointMutex;
        bool checkpointWritten = false;
        std::uint64_t checkpointArticle = 0;

        try
        {
            m_client.ProcessHeaders(startMessage,
                [this, startMessage, groupID, &headerCount, &checkpointMutex,
                 &checkpointWritten, &checkpointArticle](
                    std::shared_ptr<NntpHeaders> headers)
                {
                    if (Application::Get().ShouldStop()) return;
                    // startMessage and groupID never change once the batches
                    // are running, so the workers get their own copies. Only
                    // the shared counters are captured by reference.
                    m_threads.Queue(
                        [headers, startMessage, groupID, &headerCount,
                         &checkpointMutex, &checkpointWritten,
                         &checkpointArticle]()
                    {
                        std::uint64_t lastArticle{0};
                        bool sawArticle{false};
                        for (const auto& header: *headers)
                        {
                            if (Application::Get().ShouldStop()) return;
                            const std::uint64_t articleID{header.articleID};
                            // Track the highest id before filtering so that
                            // filtered-out articles are not fetched again.
                            sawArticle = true;
                            if (articleID > lastArticle) lastArticle = articleID;

                            std::string subject = header.subject;
                            subject = Application::Get().GetFilter()
                                .ProcessSearchString(subject);
                            if (subject.empty()) continue;
                            Application::Get().GetDb().SaveSearchTokens(
                                groupID,
                                articleID,
                                subject
                            );
                            headerCount++;
                        }

                        // Never store the NOT_INDEXED sentinel as a checkpoint,
                        // and never store an id below the resume point.
                        // ">=" matters: a batch whose highest id equals
                        // startMessage (a single new article) must still
                        // advance the checkpoint.
                        const bool advances =
                            sawArticle
                            && lastArticle != NntpListEntry::NOT_INDEXED
                            && lastArticle >= startMessage;
                        if (advances)
                        {
                            const std::lock_guard<std::mutex> lock(
                                checkpointMutex
                            );
                            // Batches may finish out of order; only move the
                            // last-indexed id forward.
                            if (!checkpointWritten
                                || lastArticle > checkpointArticle)
                            {
                                Application::Get().GetDb().SetLastIndexedArticle(
                                    groupID, lastArticle
                                );
                                checkpointWritten = true;
                                checkpointArticle = lastArticle;
                            }
                        }
                        Logger::Get().Debug("Indexer::Index", "Finished batch.");
                    });
                },
                batchSize
            );
        }
        catch (...)
        {
            // Queued batches still reference this iteration's locals; wait for
            // them before the stack unwinds past those objects.
            m_threads.JoinThreads();
            throw;
        }
        m_threads.JoinThreads();
        Logger::Get().Debug(
            LOGID("Indexer"),
            "Saved " + std::to_string(headerCount.load()) + " headers."
        );
    }
}
// Looks up `searchString` in the database and returns the matching articles,
// most hits first.
//
// The search string is first run through the application's search-string
// filter. Every database entry returned for the filtered string adds one hit
// to its article (identified by newsgroup id + article id); an entry whose
// hash equals the hash of the whole filtered search string adds a second hit,
// so that exact matches rank above partial ones. The bonus is applied whether
// or not the article was already in the result set, so the ranking does not
// depend on the order in which the database returns the entries.
//
// Returns an empty result set when the database returns no result object.
// If Application::Get().ShouldStop() becomes true while collecting results,
// Logger::Fatal<UsenetSearchException> is invoked with "Interrupted.".
//
// NOTE(review): each entry is located with a linear std::find over the
// results collected so far, so the cost grows quadratically with the number
// of distinct articles.
std::unique_ptr<SearchResults> Indexer::Search(
    const std::string& searchString)
{
    auto result = std::make_unique<SearchResults>();
    const std::string sstr = Application::Get().GetFilter().ProcessSearchString(
        searchString
    );
    const auto searchHash = StringHashBytes(sstr);
    auto searchResults = Application::Get().GetDb().Search(sstr);
    if (!searchResults) return result;

    for (const ArticleEntry& entry: *searchResults)
    {
        if (Application::Get().ShouldStop())
        {
            Logger::Get().Fatal<UsenetSearchException>(
                "Indexer",
                "Interrupted."
            );
        }

        // Find the article in the result set, appending it on first sight.
        const SearchResult candidate(entry);
        auto it = std::find(result->begin(), result->end(), candidate);
        if (it == result->end())
        {
            result->emplace_back(candidate);
            it = result->end() - 1;
        }

        it->Inc();
        // An exact match gets double points to ensure it's above other
        // partial matches.
        if (entry.hash == searchHash) it->Inc();
    }

    std::sort(
        result->begin(),
        result->end(),
        [](const SearchResult& a, const SearchResult& b){
            return a.Hits() > b.Hits();
        }
    );
    return result;
}
} // namespace usenetsearch