UsenetSearch/src/main.cpp

204 lines
6.9 KiB
C++

/*
Copyright© 2021 John Sennesael
UsenetSearch is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
UsenetSearch is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <atomic>
#include <cctype>
#include <codecvt>
#include <cstdint>
#include <exception>
#include <iostream>
#include <locale>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "usenetsearch/Configuration.h"
#include "usenetsearch/Database.h"
#include "usenetsearch/Except.h"
#include "usenetsearch/StringUtils.h"
#include "usenetsearch/ThreadPool.h"
#include "usenetsearch/UsenetClient.h"
using namespace usenetsearch;
/**
 * Writes the command line help text to standard output.
 *
 * @param programName Name shown in the usage line (normally argv[0]).
 */
void Usage(const std::string& programName)
{
    // Banner, usage line and option descriptions are emitted as one chained
    // insertion; the trailing std::endl writes the final blank line and
    // flushes the stream.
    std::cout
        << "UsenetSearch - usenet search indexer\n"
        << "Copyright© 2021 John Sennesael\n\n"
        << "Usage:\n\n"
        << programName
        << "\t"
        << "[-c <config filename>] "
        << "[-h] \n\n"
        << "-c <file>\tSets configuration file to use\n"
        << "-h\t\tShow help (this text).\n"
        << std::endl;
}
int main(int argc, char* argv[])
{
std::cout.setf(std::ios::unitbuf);
std::string configFile{"usenetsearch.conf"};
// Parse args.
for (int argn = 1; argn != argc; ++argn)
{
std::string curr_opt = argv[argn];
std::string next_opt = "";
if (argn+1 < argc) next_opt=argv[argn+1];
if (curr_opt == "-c")
{
if ((next_opt == "") or (StringStartsWith("-", next_opt)))
{
std::cerr << "Missing argument to -c option." << std::endl;
Usage(argv[0]);
return 1;
}
argn++;
configFile = argv[argn];
}
else if (curr_opt == "-h")
{
Usage(argv[0]);
return 0;
}
}
// Read config, setup db
Configuration config;
config.Open(configFile);
Database db;
db.Open(config.DatabasePath());
// Start nntp client.
ThreadPool threads;
UsenetClient client;
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
try
{
client.Connect(
config.NNTPServerHost(),
config.NNTPServerPort(),
config.NNTPServerSSL()
);
client.Authenticate(
conv.from_bytes(config.NNTPServerUser()),
conv.from_bytes(config.NNTPServerPassword())
);
// BEGIN TEMPORARY TEST CODE
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
std::unique_ptr<std::vector<NntpListEntry>> list;
// try
// {
// list = db.LoadNewsgroupList();
// }
// catch (const DatabaseException& e)
// {
// // noop
// }
NntpListEntry e{};
e.count = 100;
// 1001 headers
// e.name = "comp.os.os2.comm";
// 2541 headers
// e.name = "borland.public.cppbuilder.commandlinetools";
// 100026 headers (1859952 K) (1816.35 M)
e.name = "dk.videnskab";
// 1000437 headers
// e.name = "alt.bible";
// a million or so, but this one is very slow because all subjects look the
// same, so everything goes to the same token index, which means we're
// constantly waiting on a file lock.
// e.name = "usenetserver.test";
list = std::make_unique<std::vector<NntpListEntry>>();
list->emplace_back(e);
if ((list == nullptr) || (list->empty()))
{
std::cout << "Getting newsgroup list...";
std::cout.flush();
list = client.List();
db.UpdateNewsgroupList(*list);
std::cout << "DONE." << std::endl;
std::cout.flush();
}
std::cout << "Number of newsgroups in newsgroup: "
<< list->size() << std::endl;
std::cout.flush();
const size_t batchSize = config.BatchSize();
threads.MaxThreads(config.MaxThreads());
for (const auto& group: *list)
{
const std::wstring newsgroup = conv.from_bytes(group.name);
std::cout << "Setting group to " << group.name << "...";
std::cout.flush();
client.Group(newsgroup);
std::cout << "DONE." << std::endl;
std::cout << "Reading headers in " << group.name << " "
<< "(.=" << batchSize << " headers)." << std::endl;
std::cout.flush();
std::atomic<std::uint64_t> headerCount{0};
std::reference_wrapper<Database> dbref = std::ref(db);
client.ProcessHeaders(0,
[&threads, &headerCount, dbref](std::shared_ptr<NntpHeaders> headers){
threads.Queue([headers, &headerCount, dbref](){
for (const auto& header: *headers)
{
const std::uint64_t id{header.articleID};
std::string subject = StringRemove(
StringToLower(header.subject), std::string{"re:"}
);
subject.erase(
std::remove_if(
subject.begin(), subject.end(),
[](char c){
if (std::isspace(c)) return false;
if ((c > 65) && (c < 90)) return false;
if ((c > 97) && (c < 122)) return false;
if (c == '\'') return false;
if ((c > 48) && (c < 57)) return false;
return true;
}), subject.end()
);
dbref.get().SaveSearchTokens(id, subject);
headerCount++;
}
std::cout << ".";
std::cout.flush();
});
},
batchSize
);
threads.JoinThreads();
std::cout << "DONE." << std::endl;
std::cout << "Saved " << headerCount << " headers." << std::endl;
std::cout.flush();
}
// END TEMPORARY TEST CODE
}
catch (const UsenetSearchException& e)
{
std::cerr << e.what() << std::endl;;
return 1;
}
return 0;
}