UsenetSearch/src/Indexer.cpp

103 lines
3.8 KiB
C++
Raw Normal View History

/*
Copyright© 2021 John Sennesael
UsenetSearch is Free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include <iostream>
#include "usenetsearch/StringUtils.h"
#include "usenetsearch/Indexer.h"
namespace usenetsearch {
/**
 * Creates an indexer that works with the given application and usenet client.
 *
 * The worker thread limit of the indexer's thread pool is taken from the
 * application's configuration at construction time.
 *
 * @param app    Application object; its configuration is read here and by the
 *               other member functions.
 * @param client Usenet client the indexer uses to talk to the NNTP server.
 *
 * NOTE(review): both arguments are presumably stored as references and must
 * then outlive the indexer - confirm against the declaration in Indexer.h.
 */
Indexer::Indexer(Application& app, UsenetClient& client)
    : m_app(app),
      m_client(client)
{
    const auto threadLimit = m_app.Config().MaxThreads();
    m_threads.MaxThreads(threadLimit);
}
/**
 * Opens the connection to the configured NNTP server and logs in.
 *
 * Host, port and the SSL setting are read from the application configuration
 * and handed to the client's Connect(). The configured user name and password
 * are then converted with m_conv.from_bytes() and passed to Authenticate().
 *
 * NOTE(review): errors are not handled here, so whatever the client or the
 * string conversion throws propagates to the caller - confirm that callers
 * expect this.
 */
void Indexer::Connect()
{
    // Fetch the configuration once; auto&& binds to whatever Config() returns
    // (reference or value) without changing its constness.
    auto&& settings = m_app.Config();

    m_client.Connect(
        settings.NNTPServerHost(),
        settings.NNTPServerPort(),
        settings.NNTPServerSSL());

    m_client.Authenticate(
        m_conv.from_bytes(settings.NNTPServerUser()),
        m_conv.from_bytes(settings.NNTPServerPassword()));
}
void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
{
/**
* @todo Replace all stdout stuff with Logger class.
* @todo All the filtering (subject.erase bits and such) need to move to a
* separate class.
*/
const size_t batchSize = m_app.Config().BatchSize();
for (const auto& group: newsgroups)
{
const std::wstring newsgroup = m_conv.from_bytes(group.name);
std::cout << "Setting group to " << group.name << "...";
std::cout.flush();
m_client.Group(newsgroup);
std::cout << "DONE." << std::endl;
std::cout << "Reading headers in " << group.name << " "
<< "(.=" << batchSize << " headers)." << std::endl;
std::cout.flush();
std::atomic<std::uint64_t> headerCount{0};
std::reference_wrapper<Database> dbref = std::ref(m_app.Db());
m_client.ProcessHeaders(0,
[this, &headerCount, &dbref](std::shared_ptr<NntpHeaders> headers){
m_threads.Queue([headers, &headerCount, &dbref](){
for (const auto& header: *headers)
{
const std::uint64_t id{header.articleID};
std::string subject = StringRemove(
StringToLower(header.subject), std::string{"re:"}
);
subject.erase(
std::remove_if(
subject.begin(), subject.end(),
[](char c){
if (std::isspace(c)) return false;
if ((c > 65) && (c < 90)) return false;
if ((c > 97) && (c < 122)) return false;
if (c == '\'') return false;
if ((c > 48) && (c < 57)) return false;
return true;
}), subject.end()
);
dbref.get().SaveSearchTokens(1, id, subject);
headerCount++;
}
std::cout << ".";
std::cout.flush();
});
},
batchSize
);
m_threads.JoinThreads();
std::cout << "DONE." << std::endl;
std::cout << "Saved " << headerCount << " headers." << std::endl;
std::cout.flush();
}
}
} // namespace usenetsearch