204 lines
6.9 KiB
C++
204 lines
6.9 KiB
C++
/*
    Copyright© 2021 John Sennesael

    UsenetSearch is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch. If not, see <https://www.gnu.org/licenses/>.
*/
|
|
|
|
#include <algorithm>
#include <atomic>
#include <cctype>
#include <codecvt>
#include <cstdint>
#include <functional>
#include <iostream>
#include <locale>
#include <memory>
#include <string>
#include <thread>
#include <vector>

#include "usenetsearch/Configuration.h"
#include "usenetsearch/Database.h"
#include "usenetsearch/Except.h"
#include "usenetsearch/StringUtils.h"
#include "usenetsearch/ThreadPool.h"
#include "usenetsearch/UsenetClient.h"
|
|
|
|
using namespace usenetsearch;
|
|
|
|
/**
 * Prints the program banner and the command line help to standard output.
 *
 * @param programName Name shown at the start of the usage line (callers in
 *                    this file pass argv[0]).
 */
void Usage(const std::string& programName)
{
    // Banner.
    std::string helpText{"UsenetSearch - usenet search indexer\n"};
    helpText += "Copyright© 2021 John Sennesael\n\n";

    // Synopsis line: program name, a tab, then the supported switches.
    helpText += "Usage:\n\n";
    helpText += programName;
    helpText += "\t[-c <config filename>] [-h] \n\n";

    // Per-option reference.
    helpText += "-c <file>\tSets configuration file to use\n";
    helpText += "-h\t\tShow help (this text).\n";

    // The trailing std::endl supplies the final blank line and flushes.
    std::cout << helpText << std::endl;
}
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
std::cout.setf(std::ios::unitbuf);
|
|
|
|
std::string configFile{"usenetsearch.conf"};
|
|
|
|
// Parse args.
|
|
for (int argn = 1; argn != argc; ++argn)
|
|
{
|
|
std::string curr_opt = argv[argn];
|
|
std::string next_opt = "";
|
|
if (argn+1 < argc) next_opt=argv[argn+1];
|
|
if (curr_opt == "-c")
|
|
{
|
|
if ((next_opt == "") or (StringStartsWith("-", next_opt)))
|
|
{
|
|
std::cerr << "Missing argument to -c option." << std::endl;
|
|
Usage(argv[0]);
|
|
return 1;
|
|
}
|
|
argn++;
|
|
configFile = argv[argn];
|
|
}
|
|
else if (curr_opt == "-h")
|
|
{
|
|
Usage(argv[0]);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
// Read config, setup db
|
|
Configuration config;
|
|
config.Open(configFile);
|
|
Database db;
|
|
db.Open(config.DatabasePath());
|
|
|
|
// Start nntp client.
|
|
ThreadPool threads;
|
|
UsenetClient client;
|
|
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
|
|
try
|
|
{
|
|
client.Connect(
|
|
config.NNTPServerHost(),
|
|
config.NNTPServerPort(),
|
|
config.NNTPServerSSL()
|
|
);
|
|
client.Authenticate(
|
|
conv.from_bytes(config.NNTPServerUser()),
|
|
conv.from_bytes(config.NNTPServerPassword())
|
|
);
|
|
|
|
// BEGIN TEMPORARY TEST CODE
|
|
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> conv;
|
|
std::unique_ptr<std::vector<NntpListEntry>> list;
|
|
// try
|
|
// {
|
|
// list = db.LoadNewsgroupList();
|
|
// }
|
|
// catch (const DatabaseException& e)
|
|
// {
|
|
// // noop
|
|
// }
|
|
|
|
NntpListEntry e{};
|
|
e.count = 100;
|
|
|
|
// 1001 headers
|
|
// e.name = "comp.os.os2.comm";
|
|
|
|
// 2541 headers
|
|
// e.name = "borland.public.cppbuilder.commandlinetools";
|
|
|
|
// 100026 headers (1859952 K) (1816.35 M)
|
|
e.name = "dk.videnskab";
|
|
// 1000437 headers
|
|
// e.name = "alt.bible";
|
|
|
|
// a million or so, but this one is very slow because all subjects look the
|
|
// same, so everything goes to the same token index, which means we're
|
|
// constantly waiting on a file lock.
|
|
// e.name = "usenetserver.test";
|
|
|
|
list = std::make_unique<std::vector<NntpListEntry>>();
|
|
list->emplace_back(e);
|
|
if ((list == nullptr) || (list->empty()))
|
|
{
|
|
std::cout << "Getting newsgroup list...";
|
|
std::cout.flush();
|
|
list = client.List();
|
|
db.UpdateNewsgroupList(*list);
|
|
std::cout << "DONE." << std::endl;
|
|
std::cout.flush();
|
|
}
|
|
std::cout << "Number of newsgroups in newsgroup: "
|
|
<< list->size() << std::endl;
|
|
std::cout.flush();
|
|
const size_t batchSize = config.BatchSize();
|
|
threads.MaxThreads(config.MaxThreads());
|
|
for (const auto& group: *list)
|
|
{
|
|
const std::wstring newsgroup = conv.from_bytes(group.name);
|
|
std::cout << "Setting group to " << group.name << "...";
|
|
std::cout.flush();
|
|
client.Group(newsgroup);
|
|
std::cout << "DONE." << std::endl;
|
|
std::cout << "Reading headers in " << group.name << " "
|
|
<< "(.=" << batchSize << " headers)." << std::endl;
|
|
std::cout.flush();
|
|
std::atomic<std::uint64_t> headerCount{0};
|
|
std::reference_wrapper<Database> dbref = std::ref(db);
|
|
client.ProcessHeaders(0,
|
|
[&threads, &headerCount, dbref](std::shared_ptr<NntpHeaders> headers){
|
|
threads.Queue([headers, &headerCount, dbref](){
|
|
for (const auto& header: *headers)
|
|
{
|
|
const std::uint64_t id{header.articleID};
|
|
std::string subject = StringRemove(
|
|
StringToLower(header.subject), std::string{"re:"}
|
|
);
|
|
subject.erase(
|
|
std::remove_if(
|
|
subject.begin(), subject.end(),
|
|
[](char c){
|
|
if (std::isspace(c)) return false;
|
|
if ((c > 65) && (c < 90)) return false;
|
|
if ((c > 97) && (c < 122)) return false;
|
|
if (c == '\'') return false;
|
|
if ((c > 48) && (c < 57)) return false;
|
|
return true;
|
|
}), subject.end()
|
|
);
|
|
dbref.get().SaveSearchTokens(id, subject);
|
|
headerCount++;
|
|
}
|
|
std::cout << ".";
|
|
std::cout.flush();
|
|
});
|
|
},
|
|
batchSize
|
|
);
|
|
threads.JoinThreads();
|
|
std::cout << "DONE." << std::endl;
|
|
std::cout << "Saved " << headerCount << " headers." << std::endl;
|
|
std::cout.flush();
|
|
}
|
|
// END TEMPORARY TEST CODE
|
|
|
|
}
|
|
catch (const UsenetSearchException& e)
|
|
{
|
|
std::cerr << e.what() << std::endl;;
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|