/*
    Copyright© 2021 John Sennesael

    UsenetSearch is Free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
|
|
|
|
#include "usenetsearch/Indexer.h"

#include <mutex>

#include "usenetsearch/Application.h"
#include "usenetsearch/Logger.h"
#include "usenetsearch/StringUtils.h"
|
|
|
|
namespace usenetsearch {
|
|
|
|
// SearchResult class ----------------------------------------------------------
|
|
|
|
/**
 * Builds a search result from a database article entry by copying the
 * entry's newsgroup ID and article ID.
 *
 * NOTE(review): m_numHits is not set here; presumably it has an in-class
 * default initializer - confirm in Indexer.h.
 *
 * @param entry Article entry supplying newsgroupID and articleID.
 */
SearchResult::SearchResult(const ArticleEntry& entry):
    m_newsgroupId(entry.newsgroupID), m_articleId(entry.articleID)
{
}
|
|
|
|
/**
 * Builds a search result directly from a newsgroup ID and an article ID.
 *
 * @param newsgroupId ID of the newsgroup the article belongs to.
 * @param articleId   ID of the article.
 */
SearchResult::SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId)
    : m_newsgroupId(newsgroupId)
    , m_articleId(articleId)
{
}
|
|
|
|
/**
 * Copy constructor: duplicates the newsgroup ID, article ID and hit count
 * of another search result.
 *
 * @param other Search result to copy from.
 */
SearchResult::SearchResult(const SearchResult& other):
    m_newsgroupId(other.m_newsgroupId),
    m_articleId(other.m_articleId),
    m_numHits(other.m_numHits)
{
}
|
|
|
|
std::uint32_t SearchResult::ArticleId() const
|
|
{
|
|
return m_articleId;
|
|
}
|
|
|
|
size_t SearchResult::Hits() const
|
|
{
|
|
return m_numHits;
|
|
}
|
|
|
|
void SearchResult::Inc()
|
|
{
|
|
m_numHits++;
|
|
}
|
|
|
|
std::uint32_t SearchResult::NewsgroupId() const
|
|
{
|
|
return m_newsgroupId;
|
|
}
|
|
|
|
void SearchResult::operator=(const SearchResult& other)
|
|
{
|
|
m_articleId = other.m_articleId;
|
|
m_newsgroupId = other.m_newsgroupId;
|
|
m_numHits = other.m_numHits;
|
|
}
|
|
|
|
bool SearchResult::operator==(const SearchResult& other) const
|
|
{
|
|
const bool result =
|
|
(m_articleId == other.m_articleId)
|
|
&& (m_newsgroupId == other.m_newsgroupId);
|
|
return result;
|
|
}
|
|
|
|
bool SearchResult::operator!=(const SearchResult& other) const
|
|
{
|
|
return (
|
|
(m_articleId != other.m_articleId)
|
|
|| (m_newsgroupId != other.m_newsgroupId)
|
|
);
|
|
}
|
|
|
|
bool SearchResult::operator<(const SearchResult& other) const
|
|
{
|
|
return m_numHits < other.m_numHits;
|
|
}
|
|
|
|
bool SearchResult::operator>(const SearchResult& other) const
|
|
{
|
|
return m_numHits > other.m_numHits;
|
|
}
|
|
|
|
bool SearchResult::operator>=(const SearchResult& other) const
|
|
{
|
|
return m_numHits >= other.m_numHits;
|
|
}
|
|
|
|
bool SearchResult::operator<=(const SearchResult& other) const
|
|
{
|
|
return m_numHits <= other.m_numHits;
|
|
}
|
|
|
|
// Indexer class ---------------------------------------------------------------
|
|
|
|
/**
 * Creates an indexer bound to the given Usenet client and sizes the worker
 * queue (m_threads) from the application's configured maximum thread count.
 *
 * @param client Usenet client used for all NNTP traffic; stored in m_client.
 */
Indexer::Indexer(UsenetClient& client)
    : m_client(client)
{
    const auto maxThreads = Application::Get().GetConfig().MaxThreads();
    m_threads.MaxThreads(maxThreads);
}
|
|
|
|
void Indexer::Connect()
|
|
{
|
|
m_client.Connect(
|
|
Application::Get().GetConfig().NNTPServerHost(),
|
|
Application::Get().GetConfig().NNTPServerPort(),
|
|
Application::Get().GetConfig().NNTPServerSSL()
|
|
);
|
|
m_client.Authenticate(
|
|
m_conv.from_bytes(Application::Get().GetConfig().NNTPServerUser()),
|
|
m_conv.from_bytes(Application::Get().GetConfig().NNTPServerPassword())
|
|
);
|
|
}
|
|
|
|
void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
|
|
{
|
|
const size_t batchSize = Application::Get().GetConfig().BatchSize();
|
|
for (const auto& group: newsgroups)
|
|
{
|
|
const std::wstring newsgroup = m_conv.from_bytes(group.name);
|
|
Logger::Get().Debug(LOGID("Indexer"), "Setting group to " + group.name);
|
|
m_client.Group(newsgroup);
|
|
Logger::Get().Debug(
|
|
LOGID("Indexer"),
|
|
"Reading headers in " + group.name + " "
|
|
+ "(batch size = " + std::to_string(batchSize) + " headers)."
|
|
);
|
|
std::atomic<std::uint64_t> headerCount{0};
|
|
const std::atomic<std::uint64_t> groupID = group.id;
|
|
std::uint32_t startMessage = 0;
|
|
try
|
|
{
|
|
startMessage = Application::Get().GetDb().GetLastIndexedArticle(
|
|
groupID
|
|
);
|
|
if (startMessage == NntpListEntry::NOT_INDEXED)
|
|
{
|
|
startMessage = 0;
|
|
}
|
|
else
|
|
{
|
|
++startMessage;
|
|
}
|
|
}
|
|
catch (const DatabaseException&)
|
|
{
|
|
startMessage = 0;
|
|
}
|
|
Logger::Get().Debug(
|
|
LOGID("Indexer"),
|
|
"Indexing starting at message: "
|
|
+ std::to_string(startMessage)
|
|
);
|
|
if (Application::Get().ShouldStop()) return;
|
|
m_client.ProcessHeaders(startMessage,
|
|
[this, &startMessage, &headerCount, &groupID](std::shared_ptr<NntpHeaders> headers){
|
|
if (Application::Get().ShouldStop()) return;
|
|
m_threads.Queue([this, headers, &startMessage, &headerCount, &groupID](){
|
|
std::uint64_t lastArticle{0};
|
|
for (const auto& header: *headers)
|
|
{
|
|
if (Application::Get().ShouldStop()) return;
|
|
const std::uint64_t articleID{header.articleID};
|
|
std::string subject = header.subject;
|
|
subject = Application::Get().GetFilter().ProcessSearchString(
|
|
subject
|
|
);
|
|
if (subject == "") continue;
|
|
Application::Get().GetDb().SaveSearchTokens(
|
|
groupID,
|
|
articleID,
|
|
subject
|
|
);
|
|
headerCount++;
|
|
if (articleID > lastArticle) lastArticle = articleID;
|
|
}
|
|
if (lastArticle == NntpListEntry::NOT_INDEXED)
|
|
{
|
|
lastArticle = 0;
|
|
}
|
|
// Update last-indexed id for the newsgroup.
|
|
if (startMessage < lastArticle)
|
|
{
|
|
Application::Get().GetDb().SetLastIndexedArticle(
|
|
groupID, lastArticle
|
|
);
|
|
}
|
|
Logger::Get().Debug("Indexer::Index", "Finished batch.");
|
|
});
|
|
},
|
|
batchSize
|
|
);
|
|
m_threads.JoinThreads();
|
|
Logger::Get().Debug(
|
|
LOGID("Indexer"),
|
|
"Saved " + std::to_string(headerCount) + " headers."
|
|
);
|
|
}
|
|
}
|
|
|
|
std::unique_ptr<SearchResults> Indexer::Search(
|
|
const std::string& searchString)
|
|
{
|
|
auto result = std::make_unique<SearchResults>();
|
|
const std::string sstr = Application::Get().GetFilter().ProcessSearchString(
|
|
searchString
|
|
);
|
|
const auto searchHash = StringHashBytes(sstr);
|
|
auto searchResults = Application::Get().GetDb().Search(sstr);
|
|
if (!searchResults) return result;
|
|
for(const ArticleEntry& entry: *searchResults)
|
|
{
|
|
if (Application::Get().ShouldStop())
|
|
{
|
|
Logger::Get().Fatal<UsenetSearchException>(
|
|
"Indexer",
|
|
"Interrupted."
|
|
);
|
|
}
|
|
SearchResult sr(entry);
|
|
// Check if a matching entry already exists in the result set, if so,
|
|
// increment count. Otherwise, append a new entry.
|
|
auto it = std::find(result->begin(), result->end(), sr);
|
|
if (it != result->end())
|
|
{
|
|
(*it).Inc();
|
|
// An exact match gets double points to ensure it's above other
|
|
// partial matches.
|
|
if (entry.hash == searchHash) (*it).Inc();
|
|
}
|
|
else
|
|
{
|
|
sr.Inc();
|
|
result->emplace_back(sr);
|
|
}
|
|
}
|
|
std::sort(
|
|
result->begin(),
|
|
result->end(),
|
|
[](const SearchResult& a, const SearchResult& b){
|
|
return a.Hits() > b.Hits();
|
|
}
|
|
);
|
|
return result;
|
|
}
|
|
|
|
} // namespace usenetsearch
|