// UsenetSearch/src/Database.cpp

/*
    Copyright© 2021 John Sennesael

    UsenetSearch is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    UsenetSearch is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with UsenetSearch.  If not, see <https://www.gnu.org/licenses/>.
*/

#include <algorithm>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#include "usenetsearch/StringUtils.h"
#include "usenetsearch/UsenetClient.h"
#include "usenetsearch/ScopeExit.h"
#include "usenetsearch/Serialize.h"

#include "usenetsearch/Database.h"

namespace usenetsearch {

/// Computes a byte size for this entry from its two members.
///
/// The result is the sum of an article-ID part (the size of the container
/// object plus 8 bytes per stored ID) and a search-string part (one 64-bit
/// value plus one byte per character).
///
/// NOTE(review): sizeof(articleIDs) is the size of the container object
/// itself, not of a serialised element count - confirm this is intended.
size_t ArticleEntry::Size() const
{
    const size_t idBytes =
        sizeof(articleIDs) + (articleIDs.size() * sizeof(std::uint64_t));
    const size_t textBytes =
        sizeof(std::uint64_t) + (searchString.size() * sizeof(char));
    return idBytes + textBytes;
}

// Database class --------------------------------------------------------------

/// Closes the newsgroup list file if it is still open.
Database::~Database()
{
    if (!m_newsGroupFileIO.is_open())
    {
        // Nothing to release.
        return;
    }
    m_newsGroupFileIO.close();
}

/// Builds the path of the article list file that belongs to a newsgroup.
///
/// The newsgroup name is converted to bytes and hashed with StringHash(); the
/// first three 2-character slices of the hash become nested directories below
/// "<database>/articles", and the file itself is named "<hash>.db".
///
/// @param newsgroup Name of the newsgroup.
/// @param mkdirs    When true, the directory chain is created if missing.
/// @return Full path to the article file (the file itself is not created).
std::filesystem::path Database::GetArticleFilePath(
    const std::wstring& newsgroup, bool mkdirs)
{
    const std::string hash = StringHash(m_conv.to_bytes(newsgroup));

    // Fan out over three directory levels: hash[0..1] / hash[2..3] / hash[4..5].
    std::filesystem::path directory = m_databasePath / "articles";
    for (std::size_t offset = 0; offset != 6; offset += 2)
    {
        directory /= hash.substr(offset, 2);
    }

    if (mkdirs && !std::filesystem::exists(directory))
    {
        std::filesystem::create_directories(directory);
    }
    return directory / (hash + ".db");
}

bool Database::GetArticleEntry(
    const std::string& subToken,
    const std::string& searchString,
    ArticleEntry& entry,
    size_t& startPosition,
    size_t& endPosition,
    size_t& count)
{
    const auto path = GetTokenFilePath(subToken);
    if (!std::filesystem::exists(path)) return false;

    SerializableFile io;

    LockFile(path);
    ScopeExit closeAndUnlockFile([&](){
        if (io.is_open()) io.close();
        UnlockFile(path);
    });

    io.open(path, std::ios::binary | std::ios::in);
    if (!io.is_open()) return false;

    std::uint64_t articleCount{0};
    io >> articleCount;
    size_t startPos{0};
    size_t endPos{0};
    for (std::uint64_t i = 0; i != articleCount; ++i)
    {
        ArticleEntry curEntry{};
        startPos = io.tellg();
        io >> curEntry;
        endPos = io.tellg();
        if (curEntry.searchString == searchString)
        {
            entry = curEntry;
            startPosition = startPos;
            endPosition = endPos;
            count = i + 1;
            return true;
        }
    }
    return false;
}

/// Builds the path of the file that stores the entries for a token.
///
/// The token is hashed with StringHash(); characters 0-1, 2-3 and 4-5 of the
/// hash name three nested directories below "<database>/tokens", and the file
/// itself is named "<hash>.db".
///
/// @param token  Token to locate.
/// @param mkdirs When true, the directory chain is created if missing.
/// @return Full path to the token file (the file itself is not created).
std::filesystem::path Database::GetTokenFilePath(
    const std::string& token,
    bool mkdirs
)
{
    const std::string hash = StringHash(token);
    const std::string level1 = hash.substr(0, 2);
    const std::string level2 = hash.substr(2, 2);
    const std::string level3 = hash.substr(4, 2);

    std::filesystem::path directory = m_databasePath;
    directory /= "tokens";
    directory /= level1;
    directory /= level2;
    directory /= level3;

    const bool mustCreate = mkdirs && !std::filesystem::exists(directory);
    if (mustCreate)
    {
        std::filesystem::create_directories(directory);
    }

    std::filesystem::path result = directory;
    result /= hash + ".db";
    return result;
}


/// Tells whether an article ID is already stored for a search string in the
/// token file of `subToken`.
///
/// @param subToken  Token whose file is inspected.
/// @param subject   Search string the article ID must be filed under.
/// @param articleID Article ID to look for.
/// @return true when an entry with searchString == subject contains
///         articleID; false otherwise, including when the token file does not
///         exist or cannot be opened.
bool Database::HasToken(
    const std::string& subToken,
    const std::string& subject,
    std::uint64_t articleID)
{
    const auto path = GetTokenFilePath(subToken);
    if (!std::filesystem::exists(path)) return false;

    SerializableFile io;

    LockFile(path);
    // Close and unlock on every exit path, including deserialisation errors.
    ScopeExit closeAndUnlockFile([&](){
        if (io.is_open()) io.close();
        UnlockFile(path);
    });

    io.open(path, std::ios::binary | std::ios::in);
    if (!io.is_open()) return false;

    // The file starts with the number of entries that follow. The count stays
    // 0 when the read stores nothing, so the loop below is then skipped.
    std::uint64_t articleCount{0};
    io.read(reinterpret_cast<char*>(&articleCount), sizeof(articleCount));

    // Iterate over every stored entry. (This loop used to be bounded by a
    // constant 0, which made the function always return false.)
    for (std::uint64_t i = 0; i != articleCount; ++i)
    {
        ArticleEntry entry;
        io >> entry;
        if (entry.searchString != subject) continue;
        const auto idsEnd = entry.articleIDs.end();
        if (std::find(entry.articleIDs.begin(), idsEnd, articleID) != idsEnd)
        {
            return true;
        }
    }
    return false;
}

/// Blocks until `file` is not in the locked-file list, then adds it.
///
/// The list is polled every 20 ms. The "is it free?" check and the insertion
/// happen under the same hold of m_lockedFilesMutex, so two threads can never
/// both conclude the file is free and both register it.
///
/// The lock is not re-entrant: locking a file that the calling thread already
/// holds waits forever.
///
/// @param file Path to lock; must be released again with UnlockFile().
void Database::LockFile(std::filesystem::path file)
{
#if 1
    while (true)
    {
        {
            std::lock_guard<std::mutex> lock(m_lockedFilesMutex);
            const auto it =
                std::find(m_lockedFiles.begin(), m_lockedFiles.end(), file);
            if (it == m_lockedFiles.end())
            {
                // Free: claim it while still holding the mutex.
                m_lockedFiles.emplace_back(file);
                return;
            }
        }
        // Held by someone else; back off without holding the mutex.
        std::this_thread::sleep_for(std::chrono::milliseconds{20});
    }
#else
    const std::filesystem::path lockFilePath = file.string() + ".lock";
    while (true)
    {
        if (!std::filesystem::exists(lockFilePath))
        {
            break;
        }
        std::this_thread::sleep_for(std::chrono::milliseconds{20});
    }
    std::ofstream touch(lockFilePath);
#endif
}

/// Removes `file` from the locked-file list so that waiting LockFile() calls
/// can proceed. Unlocking a file that is not in the list does nothing.
///
/// @param file Path previously passed to LockFile().
void Database::UnlockFile(std::filesystem::path file)
{
#if 1
    std::lock_guard<std::mutex> guard(m_lockedFilesMutex);
    const auto position =
        std::find(m_lockedFiles.begin(), m_lockedFiles.end(), file);
    if (position == m_lockedFiles.end())
    {
        return;
    }
    m_lockedFiles.erase(position);
#else
    const std::filesystem::path lockFilePath = file.string() + ".lock";
    if (std::filesystem::exists(lockFilePath))
    {
        std::filesystem::remove(lockFilePath);
    }
#endif
}

/// Loads every article header stored for a newsgroup.
///
/// The article file starts with a 64-bit header count followed by that many
/// serialised NntpHeader records. The file is locked for the duration of the
/// read.
///
/// @param newsgroup Name of the newsgroup.
/// @return The stored headers, in file order.
/// @throws DatabaseException (ENOTFOUND) when no article file exists for the
///         newsgroup.
std::unique_ptr<std::vector<NntpHeader>> Database::LoadArticleList(
    const std::wstring& newsgroup)
{
    const auto articleFile = GetArticleFilePath(newsgroup);
    if (!std::filesystem::exists(articleFile))
    {
        throw DatabaseException(ENOTFOUND,
            "No article list found for newsgroup " + m_conv.to_bytes(newsgroup)
        );
    }

    SerializableFile io;

    LockFile(articleFile);
    // Close and unlock on every exit path, including deserialisation errors.
    ScopeExit closeAndUnlockFile([&](){
        if (io.is_open()) io.close();
        UnlockFile(articleFile);
    });

    io.open(articleFile, std::ios::binary | std::ios::in);

    // Initialised so that a read which stores nothing (e.g. an empty file)
    // yields an empty list instead of looping over an indeterminate count.
    std::uint64_t articleCount{0};
    io.read(
        reinterpret_cast<char*>(&articleCount),
        sizeof(articleCount)
    );

    auto result = std::make_unique<std::vector<NntpHeader>>();
    for (std::uint64_t i = 0; i != articleCount; ++i)
    {
        NntpHeader header;
        io >> header;
        result->emplace_back(std::move(header));
    }
    return result;
}

/// Loads the stored list of newsgroups from the newsgroup file.
///
/// File layout as read here: a 64-bit database version, a 64-bit entry count,
/// then that many serialised NntpListEntry records.
///
/// @return The stored newsgroup entries, in file order.
/// @throws DatabaseException (EINVAL) when the version stored in the file
///         differs from m_databaseVersion. A file from which no version could
///         be read is reported as version 0.
std::unique_ptr<std::vector<NntpListEntry>> Database::LoadNewsgroupList()
{
    OpenNewsGroupFile();

    // The stream is kept open between calls, so always start from the top of
    // the file rather than wherever the previous read or write left off.
    m_newsGroupFileIO.seekg(0);

    std::uint64_t dbVersion{0};
    m_newsGroupFileIO.read(
        reinterpret_cast<char*>(&dbVersion),
        sizeof(dbVersion)
    );
    if (dbVersion != m_databaseVersion)
    {
        throw DatabaseException(EINVAL,
            "The loaded database version (" + std::to_string(dbVersion)
            + ") does not match the current database version ("
            + std::to_string(m_databaseVersion) + ")");
    }

    // UpdateNewsgroupList() stores the count as std::uint64_t; read it back
    // with the same fixed width instead of the platform-dependent size_t.
    std::uint64_t newsGroupCount{0};
    m_newsGroupFileIO.read(
        reinterpret_cast<char*>(&newsGroupCount),
        sizeof(newsGroupCount)
    );

    auto result = std::make_unique<std::vector<NntpListEntry>>();
    for (std::uint64_t numLoaded = 0; numLoaded != newsGroupCount; ++numLoaded)
    {
        NntpListEntry entry;
        m_newsGroupFileIO >> entry;
        result->emplace_back(std::move(entry));
    }
    return result;
}

/// Points the database at `dbPath` and opens the newsgroup list file.
///
/// The directory is created when it does not exist yet (only the last path
/// component; parent directories are not created).
///
/// @param dbPath Root directory of the database.
void Database::Open(std::filesystem::path dbPath)
{
    m_databasePath = dbPath;

    const bool alreadyPresent = std::filesystem::exists(dbPath);
    if (!alreadyPresent)
    {
        std::filesystem::create_directory(dbPath);
    }

    OpenNewsGroupFile();
}

/// Opens "<database>/newsgroups.db" for binary reading and writing, unless it
/// is already open.
///
/// An empty file is created first when none exists, because opening with
/// in | out does not create a missing file under std::fstream rules.
/// NOTE(review): this assumes SerializableFile::open follows those rules -
/// confirm; if it already creates the file, the extra step is harmless.
void Database::OpenNewsGroupFile()
{
    if (m_newsGroupFileIO.is_open())
    {
        return;
    }

    const std::filesystem::path newsGroupFilePath =
        m_databasePath / "newsgroups.db";

    if (!std::filesystem::exists(newsGroupFilePath))
    {
        // Create an empty file; the temporary stream closes it again at once.
        std::ofstream create(newsGroupFilePath, std::ios::binary);
    }

    m_newsGroupFileIO.open(newsGroupFilePath,
        std::ios::binary | std::ios::in | std::ios::out);
}

void Database::UpdateArticleList(
    const std::wstring& newsgroup,
    const std::vector<NntpHeader>& headers)
{
    const auto articleFile = GetArticleFilePath(newsgroup, true);

    LockFile(articleFile);
    ScopeExit unlockFile([&](){
        UnlockFile(articleFile);
    });

    SerializableFile io;
    io.open(articleFile, std::ios::binary | std::ios::out);
    const std::uint64_t articleCount = headers.size();
    io << articleCount;
    for (const auto& header: headers)
    {
        io << header;
    }
    io.close();
}

/// Writes the newsgroup list to the newsgroup file, starting at the top of
/// the file.
///
/// Layout written: the database version (raw bytes of m_databaseVersion), the
/// entry count as std::uint64_t, then every entry in order. The stream is
/// flushed afterwards.
///
/// NOTE(review): the file is not truncated, so bytes of a previously longer
/// list may remain after the new data; readers are bounded by the stored
/// count. If an earlier read left the stream in a failed state the seek and
/// the writes have no effect - consider clearing the stream state first
/// (confirm SerializableFile exposes clear()).
///
/// @param list Newsgroup entries to store.
void Database::UpdateNewsgroupList(const std::vector<NntpListEntry>& list)
{
    OpenNewsGroupFile();

    // The stream is kept open between calls; rewind so the list replaces the
    // existing content instead of being written wherever the last read or
    // write stopped. (Same positioning call SaveToken() uses before writing.)
    m_newsGroupFileIO.seekg(0);

    m_newsGroupFileIO.write(
        reinterpret_cast<const char*>(&m_databaseVersion),
        sizeof(m_databaseVersion)
    );

    const std::uint64_t newsGroupCount = list.size();
    m_newsGroupFileIO << newsGroupCount;

    for (const auto& entry: list)
    {
        m_newsGroupFileIO << entry;
    }
    m_newsGroupFileIO.flush();
}

void Database::SaveSearchTokens(
    std::uint64_t articleID,
    const std::string& searchString)
{
    const std::string sstr(searchString);
    StringTreeOperation(
        sstr,
        " ",
        m_maxTreeDepth,
        [&](const std::string& subToken, const std::string& str){
            try
            {
                SaveToken(subToken, str, articleID);
            }
            catch (const SerializeException& e)
            {
                /// @todo do graceful magic here.
                std::cout << "Broken file for \"" << subToken << "\"" << std::endl;
            }
        }
    );
}

/// Records that `articleID` matches `subject` in the token file of `subtoken`.
///
/// Three cases are handled:
///  - the token file does not exist: it is created with a single entry;
///  - the file exists and already has an entry for `subject`: the article ID
///    is appended to that entry and the entry plus everything after it is
///    rewritten (the entry grows, so the tail has to move);
///  - the file exists without an entry for `subject`: a new entry is appended
///    and the entry count at the start of the file is incremented.
///
/// Nothing is written when `subtoken` or `subject` is empty or a single
/// space, or when the article ID is already stored for `subject`.
///
/// NOTE(review): the lookups (HasToken / GetArticleEntry) and the write below
/// each take the file lock separately, so another writer could modify the
/// file in between - confirm callers serialise writes per token.
///
/// @param subtoken  Token that selects the token file.
/// @param subject   Search string the article is filed under.
/// @param articleID ID of the article.
void Database::SaveToken(
    const std::string& subtoken,
    const std::string& subject,
    std::uint64_t articleID)
{
    if (subtoken.empty() || subtoken == " ") return;
    if (subject.empty() || subject == " ") return;
    if (HasToken(subtoken, subject, articleID)) return;
    const auto path = GetTokenFilePath(subtoken, true);
    size_t startPos{0};
    size_t endPos{0};
    size_t count{0};
    ArticleEntry entry{};
    const bool found = GetArticleEntry(
        subtoken, subject, entry, startPos, endPos, count);

    // Never store the same article twice under one search string.
    if (found
        && std::find(entry.articleIDs.begin(), entry.articleIDs.end(),
                     articleID) != entry.articleIDs.end())
    {
        return;
    }

    std::uint64_t entryCount{0};

    if (!std::filesystem::exists(path))
    {
        // Creating a new token file is pretty simple.
        entryCount = 1;
        SerializableFile io;
        LockFile(path);
        ScopeExit unlockFile([&](){
            if (io.is_open()) io.close();
            UnlockFile(path);
        });
        io.open(path, std::ios::binary | std::ios::out);
        io << entryCount;
        entry.searchString = subject;
        entry.articleIDs.emplace_back(articleID);
        io << entry;
    }
    else
    {
        // Updating an existing token file is a bit more complicated.
        SerializableFile io;
        // Take the lock before touching the file, not after opening it.
        LockFile(path);
        ScopeExit unlockFile([&](){
            if (io.is_open()) io.close();
            UnlockFile(path);
        });
        io.open(path, std::ios::binary | std::ios::in | std::ios::out);
        // Read entry count.
        io >> entryCount;
        if (found)
        {
            // Seek to the end of the entry and read all following entries;
            // they must be moved because the entry is about to grow.
            io.seekg(endPos);
            std::vector<ArticleEntry> entries;
            for (std::uint64_t i = count; i < entryCount; ++i)
            {
                ArticleEntry e{};
                io >> e;
                entries.emplace_back(std::move(e));
            }
            // Add the new article to the existing entry. (Previously the
            // entry was rewritten unchanged and the ID was lost.)
            entry.articleIDs.emplace_back(articleID);
            // Seek back to the start of the existing entry and rewrite it.
            io.seekg(startPos);
            io << entry;
            // Now re-write all following entries.
            for (const auto& e: entries)
            {
                io << e;
            }
        }
        else
        {
            // This is a new search string in the token file, append to the end.
            io.seekg(0, std::ios_base::end);
            entry.searchString = subject;
            entry.articleIDs = {articleID};
            io << entry;
            // Increment the entry count stored at the start of the file.
            entryCount++;
            io.seekg(0);
            io << entryCount;
        }
    }
}

} // namespace usenetsearch