103 lines
3.8 KiB
C++
103 lines
3.8 KiB
C++
|
/*
    Copyright© 2021 John Sennesael

    UsenetSearch is Free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
|
||
|
|
||
|
#include <iostream>
|
||
|
|
||
|
#include "usenetsearch/StringUtils.h"
|
||
|
|
||
|
#include "usenetsearch/Indexer.h"
|
||
|
|
||
|
namespace usenetsearch {
|
||
|
|
||
|
/**
 * Constructs an indexer bound to the given application and usenet client.
 *
 * Both arguments are stored in members (m_app / m_client), and the thread
 * limit of m_threads is set from the application's configuration.
 *
 * @param app    Application object providing the configuration.
 * @param client Usenet client used by this indexer.
 */
Indexer::Indexer(Application& app, UsenetClient& client)
    : m_app(app)
    , m_client(client)
{
    // Read the configured limit first, then hand it to the thread container.
    const auto maxThreads = m_app.Config().MaxThreads();
    m_threads.MaxThreads(maxThreads);
}
|
||
|
|
||
|
void Indexer::Connect()
|
||
|
{
|
||
|
m_client.Connect(
|
||
|
m_app.Config().NNTPServerHost(),
|
||
|
m_app.Config().NNTPServerPort(),
|
||
|
m_app.Config().NNTPServerSSL()
|
||
|
);
|
||
|
m_client.Authenticate(
|
||
|
m_conv.from_bytes(m_app.Config().NNTPServerUser()),
|
||
|
m_conv.from_bytes(m_app.Config().NNTPServerPassword())
|
||
|
);
|
||
|
}
|
||
|
|
||
|
void Indexer::Index(const std::vector<NntpListEntry>& newsgroups)
|
||
|
{
|
||
|
/**
|
||
|
* @todo Replace all stdout stuff with Logger class.
|
||
|
* @todo All the filtering (subject.erase bits and such) need to move to a
|
||
|
* separate class.
|
||
|
*/
|
||
|
const size_t batchSize = m_app.Config().BatchSize();
|
||
|
for (const auto& group: newsgroups)
|
||
|
{
|
||
|
const std::wstring newsgroup = m_conv.from_bytes(group.name);
|
||
|
std::cout << "Setting group to " << group.name << "...";
|
||
|
std::cout.flush();
|
||
|
m_client.Group(newsgroup);
|
||
|
std::cout << "DONE." << std::endl;
|
||
|
std::cout << "Reading headers in " << group.name << " "
|
||
|
<< "(.=" << batchSize << " headers)." << std::endl;
|
||
|
std::cout.flush();
|
||
|
std::atomic<std::uint64_t> headerCount{0};
|
||
|
std::reference_wrapper<Database> dbref = std::ref(m_app.Db());
|
||
|
m_client.ProcessHeaders(0,
|
||
|
[this, &headerCount, &dbref](std::shared_ptr<NntpHeaders> headers){
|
||
|
m_threads.Queue([headers, &headerCount, &dbref](){
|
||
|
for (const auto& header: *headers)
|
||
|
{
|
||
|
const std::uint64_t id{header.articleID};
|
||
|
std::string subject = StringRemove(
|
||
|
StringToLower(header.subject), std::string{"re:"}
|
||
|
);
|
||
|
subject.erase(
|
||
|
std::remove_if(
|
||
|
subject.begin(), subject.end(),
|
||
|
[](char c){
|
||
|
if (std::isspace(c)) return false;
|
||
|
if ((c > 65) && (c < 90)) return false;
|
||
|
if ((c > 97) && (c < 122)) return false;
|
||
|
if (c == '\'') return false;
|
||
|
if ((c > 48) && (c < 57)) return false;
|
||
|
return true;
|
||
|
}), subject.end()
|
||
|
);
|
||
|
dbref.get().SaveSearchTokens(1, id, subject);
|
||
|
headerCount++;
|
||
|
}
|
||
|
std::cout << ".";
|
||
|
std::cout.flush();
|
||
|
});
|
||
|
},
|
||
|
batchSize
|
||
|
);
|
||
|
m_threads.JoinThreads();
|
||
|
std::cout << "DONE." << std::endl;
|
||
|
std::cout << "Saved " << headerCount << " headers." << std::endl;
|
||
|
std::cout.flush();
|
||
|
}
|
||
|
}
|
||
|
|
||
|
} // namespace usenetsearch
|