UsenetSearch/src/Indexer.cpp

251 lines
7.3 KiB
C++

/*
Copyright© 2021 John Sennesael
UsenetSearch is Free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include "usenetsearch/Indexer.h"
#include "usenetsearch/Logger.h"
#include "usenetsearch/StringUtils.h"
#include <iostream>
#include <map>
#include <mutex>
#include <utility>
namespace usenetsearch {
// SearchResult class ----------------------------------------------------------
/**
 * Builds a search result that refers to the article described by an index
 * entry.
 *
 * Only the newsgroup id and article id are taken from the entry; the hit
 * counter is not touched here.
 * NOTE(review): presumably the hit counter gets its initial value from the
 * class declaration - confirm in Indexer.h.
 *
 * @param entry Index entry supplying the newsgroup and article ids.
 */
SearchResult::SearchResult(const ArticleEntry& entry)
    : m_newsgroupId(entry.newsgroupID),
      m_articleId(entry.articleID)
{
}
/**
 * Builds a search result directly from a newsgroup id / article id pair.
 *
 * @param newsgroupId Id of the newsgroup the article belongs to.
 * @param articleId   Id of the article.
 */
SearchResult::SearchResult(std::uint32_t newsgroupId, std::uint32_t articleId)
    : m_newsgroupId(newsgroupId),
      m_articleId(articleId)
{
}
/**
 * Copy constructor: duplicates the ids and the hit count of another result.
 *
 * @param other Result to copy from.
 */
SearchResult::SearchResult(const SearchResult& other)
    : m_newsgroupId(other.m_newsgroupId),
      m_articleId(other.m_articleId)
{
    m_numHits = other.m_numHits;
}
std::uint32_t SearchResult::ArticleId() const
{
return m_articleId;
}
size_t SearchResult::Hits() const
{
return m_numHits;
}
/**
 * Registers one more hit for this result.
 */
void SearchResult::Inc()
{
    ++m_numHits;
}
std::uint32_t SearchResult::NewsgroupId() const
{
return m_newsgroupId;
}
/**
 * Copy assignment: overwrites the ids and the hit count with those of
 * another result.
 *
 * Returns nothing, so assignments cannot be chained.
 *
 * @param other Result to copy from.
 */
void SearchResult::operator=(const SearchResult& other)
{
    m_newsgroupId = other.m_newsgroupId;
    m_articleId = other.m_articleId;
    m_numHits = other.m_numHits;
}
/**
 * Identity comparison: two results are equal when they point at the same
 * article in the same newsgroup. The hit count is deliberately not part of
 * the comparison.
 *
 * @param other Result to compare against.
 * @return true when both the article id and the newsgroup id match.
 */
bool SearchResult::operator==(const SearchResult& other) const
{
    if (m_articleId != other.m_articleId)
    {
        return false;
    }
    return m_newsgroupId == other.m_newsgroupId;
}
/**
 * Identity comparison, negated: true when the two results refer to different
 * articles (different article id or different newsgroup id). The hit count
 * is not considered.
 *
 * @param other Result to compare against.
 * @return true when either id differs.
 */
bool SearchResult::operator!=(const SearchResult& other) const
{
    const bool sameArticle = (m_articleId == other.m_articleId);
    const bool sameNewsgroup = (m_newsgroupId == other.m_newsgroupId);
    return !(sameArticle && sameNewsgroup);
}
/**
 * Orders results by hit count only (the ids play no role, unlike in
 * operator==).
 *
 * @param other Result to compare against.
 * @return true when this result has fewer hits than other.
 */
bool SearchResult::operator<(const SearchResult& other) const
{
    return other.m_numHits > m_numHits;
}
/**
 * Orders results by hit count only.
 *
 * @param other Result to compare against.
 * @return true when this result has more hits than other.
 */
bool SearchResult::operator>(const SearchResult& other) const
{
    return other.m_numHits < m_numHits;
}
/**
 * Orders results by hit count only.
 *
 * @param other Result to compare against.
 * @return true when this result has at least as many hits as other.
 */
bool SearchResult::operator>=(const SearchResult& other) const
{
    return !(m_numHits < other.m_numHits);
}
/**
 * Orders results by hit count only.
 *
 * @param other Result to compare against.
 * @return true when this result has at most as many hits as other.
 */
bool SearchResult::operator<=(const SearchResult& other) const
{
    return !(m_numHits > other.m_numHits);
}
// Indexer class ---------------------------------------------------------------
/**
 * Creates an indexer bound to an application and an NNTP client.
 *
 * The thread limit of m_threads is taken from the application configuration
 * (MaxThreads()).
 *
 * @param app    Application that provides configuration, database and filter.
 * @param client Usenet client used to talk to the news server.
 */
Indexer::Indexer(Application& app, UsenetClient& client)
    : m_app(app),
      m_client(client)
{
    const auto configuredMaxThreads = m_app.GetConfig().MaxThreads();
    m_threads.MaxThreads(configuredMaxThreads);
}
/**
 * Connects the NNTP client to the configured server and authenticates.
 *
 * Host, port and the SSL flag come from the application configuration. The
 * configured user name and password are converted with m_conv.from_bytes()
 * before being handed to the client.
 */
void Indexer::Connect()
{
    auto&& config = m_app.GetConfig();

    m_client.Connect(
        config.NNTPServerHost(),
        config.NNTPServerPort(),
        config.NNTPServerSSL()
    );

    // Credentials are converted only after the connection has been made.
    m_client.Authenticate(
        m_conv.from_bytes(config.NNTPServerUser()),
        m_conv.from_bytes(config.NNTPServerPassword())
    );
}
void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
{
/**
* @todo Replace all stdout stuff with Logger class.
*/
const size_t batchSize = m_app.GetConfig().BatchSize();
for (const auto& group: newsgroups)
{
const std::wstring newsgroup = m_conv.from_bytes(group.name);
std::cout << "Setting group to " << group.name << "...";
std::cout.flush();
m_client.Group(newsgroup);
std::cout << "DONE." << std::endl;
std::cout << "Reading headers in " << group.name << " "
<< "(.=" << batchSize << " headers)." << std::endl;
std::cout.flush();
std::atomic<std::uint64_t> headerCount{0};
const std::atomic<std::uint64_t> groupID = group.id;
std::reference_wrapper<Database> dbref = std::ref(m_app.GetDb());
std::uint32_t startMessage = 0;
try
{
startMessage = dbref.get().GetLastIndexedArticle(groupID);
if (startMessage == NntpListEntry::NOT_INDEXED)
{
startMessage = 0;
}
else
{
++startMessage;
}
}
catch (const DatabaseException&)
{
startMessage = 0;
}
std::cout << "Indexing starting at message: "
<< std::to_string(startMessage) << std::endl;
m_client.ProcessHeaders(startMessage,
[this, &startMessage, &headerCount, &dbref, &groupID](std::shared_ptr<NntpHeaders> headers){
m_threads.Queue([this, headers, &startMessage, &headerCount, &dbref, &groupID](){
std::uint64_t lastArticle{0};
for (const auto& header: *headers)
{
const std::uint64_t articleID{header.articleID};
std::string subject = header.subject;
subject = m_app.GetFilter().ProcessSearchString(
subject
);
if (subject == "") continue;
dbref.get().SaveSearchTokens(
groupID,
articleID,
subject
);
headerCount++;
if (articleID > lastArticle) lastArticle = articleID;
}
if (lastArticle == NntpListEntry::NOT_INDEXED)
{
lastArticle = 0;
}
// Update last-indexed id for the newsgroup.
if (startMessage < lastArticle)
{
dbref.get().SetLastIndexedArticle(
groupID, lastArticle
);
}
std::cout << ".";
std::cout.flush();
});
},
batchSize
);
m_threads.JoinThreads();
std::cout << "DONE." << std::endl;
std::cout << "Saved " << headerCount << " headers." << std::endl;
std::cout.flush();
}
}
std::unique_ptr<SearchResults> Indexer::Search(
const std::string& searchString)
{
auto result = std::make_unique<SearchResults>();
const std::string sstr = m_app.GetFilter().ProcessSearchString(
searchString
);
auto searchResults = m_app.GetDb().Search(sstr);
if (!searchResults) return result;
for(const ArticleEntry& entry: *searchResults)
{
SearchResult sr(entry);
// Check if a matching entry already exists in the result set, if so,
// increment count. Otherwise, append a new entry.
auto it = std::find(result->begin(), result->end(), sr);
if (it != result->end())
{
(*it).Inc();
}
else
{
sr.Inc();
result->emplace_back(sr);
}
}
std::sort(
result->begin(),
result->end(),
[](const SearchResult& a, const SearchResult& b){
return a.Hits() > b.Hits();
}
);
return result;
}
} // namespace usenetsearch