qcommsy/libcommsy.cpp

/*
    QCommsy
    Copyright (C) 2020  niansa

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <exception>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

#include <csignal>
#include <cstdio>
#include <unistd.h>

#include <gumbo.h>

#include "libcommsy.hpp"


// Returns a copy of s with its leading run of [[:space:]] characters removed.
std::string ltrim(const std::string& s) {
    static const std::regex leading_ws{"^[[:space:]]*", std::regex_constants::extended};

    // The anchored pattern can only match at offset 0 (possibly as an empty
    // match), so everything after the match is the trimmed string.
    std::smatch hit;
    if (!std::regex_search(s, hit, leading_ws)) {
        return s;
    }
    return hit.suffix().str();
}
// Returns a copy of s with its trailing run of [[:space:]] characters removed.
std::string rtrim(const std::string& s) {
    static const std::regex trailing_ws{"[[:space:]]*$", std::regex_constants::extended};

    // The leftmost match of the end-anchored pattern starts exactly where the
    // trailing whitespace begins, so the text before it is the result.
    std::smatch hit;
    if (!std::regex_search(s, hit, trailing_ws)) {
        return s;
    }
    return hit.prefix().str();
}
// Returns a copy of s with whitespace stripped from both ends
// (trailing whitespace is removed first, then leading whitespace).
std::string trim(const std::string& s) {
    const std::string without_tail = rtrim(s);
    return ltrim(without_tail);
}

// Returns the part of path that follows the last '/' or '\' separator.
// A path without any separator is returned unchanged; a path ending in a
// separator yields an empty string.
std::string get_filename(const std::string& path) {
    const std::string::size_type last_sep = path.find_last_of("/\\");
    if (last_sep == std::string::npos) {
        return path;
    }
    return path.substr(last_sep + 1);
}

// Normalises a scraped string in three steps:
//  1. every run of two or more spaces is removed entirely,
//  2. every newline becomes a single space,
//  3. the first four characters of the result are dropped
//     (fewer if the string is shorter than that).
std::string clean_spaces(const std::string& s) {
    static const std::regex space_run{"[ ]{2,}", std::regex_constants::extended};

    std::string cleaned = std::regex_replace(s, space_run, "");
    for (char& ch : cleaned) {
        if (ch == '\n') {
            ch = ' ';
        }
    }

    if (cleaned.size() <= 4) {
        return std::string();
    }
    return cleaned.substr(4);
}

// Returns base with every element of addition appended, in order.
// base is taken by value, so the caller's vector is not modified.
std::vector<std::string> merge_strvects(std::vector<std::string> base, const std::vector<std::string> &addition) {
    base.reserve(base.size() + addition.size());
    for (const std::string& item : addition) {
        base.push_back(item);
    }
    return base;
}

// HTTP helper defined in another translation unit. The callers in this file
// pass the session ID and the full URL, treat the return value as the HTTP
// status code (200, 302, 500, ...) and read the response body back out of
// responsebuffer.
extern long curlreq(std::stringstream &responsebuffer, std::string SID, std::string URL);

// Walks the subtree rooted at node depth-first (parent before children) and
// appends to *elemvect every element that
//   - has the tag expectedtag, and
//   - carries an attribute attrname whose value equals searchword exactly.
// Non-element nodes are ignored.
void gumbo_search_by_attr(std::vector<GumboNode *> *elemvect, GumboNode* node, const std::string& attrname, const std::string& searchword, const GumboTag& expectedtag) {
    // Only element nodes have a tag, attributes and children
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    auto& element = node->v.element;

    // The attribute is only looked up when the tag already matches
    if (element.tag == expectedtag) {
        const GumboAttribute* attr = gumbo_get_attribute(&element.attributes, attrname.c_str());
        if (attr != nullptr && searchword == attr->value) {
            elemvect->push_back(node);
        }
    }

    // Descend into every child, whether or not this node matched
    const GumboVector& kids = element.children;
    for (unsigned int idx = 0; idx != kids.length; ++idx) {
        gumbo_search_by_attr(elemvect, static_cast<GumboNode*>(kids.data[idx]), attrname, searchword, expectedtag);
    }
}

// Convenience wrapper around gumbo_search_by_attr() that matches on the
// "class" attribute. The whole attribute value must equal searchword.
inline void gumbo_search_by_class(std::vector<GumboNode *> *elemvect, GumboNode* node, const std::string& searchword, const GumboTag& expectedtag) {
    const std::string class_attr = "class";
    gumbo_search_by_attr(elemvect, node, class_attr, searchword, expectedtag);
}

// Returns the first element (in document order) below node that has the tag
// expectedtag and an "id" attribute equal to searchword.
// Throws libCommsy::scrapError when no such element exists.
GumboNode *gumbo_search_by_id(GumboNode* node, const std::string& searchword, const GumboTag& expectedtag) {
    std::vector<GumboNode *> matches;
    gumbo_search_by_attr(&matches, node, "id", searchword, expectedtag);

    // Nothing found: report a scraping failure
    if (matches.empty()) {
        throw libCommsy::scrapError();
    }
    // Otherwise the first hit wins
    return matches.front();
}

// Walks the subtree rooted at node depth-first (parent before children) and
// appends every element whose tag equals searchedtag to *elemvect.
// Non-element nodes are ignored.
void gumbo_search_by_tag(std::vector<GumboNode *> *elemvect, GumboNode* node, const GumboTag& searchedtag) {
    if (node->type == GUMBO_NODE_ELEMENT) {
        auto& element = node->v.element;

        if (searchedtag == element.tag) {
            elemvect->push_back(node);
        }

        const GumboVector& kids = element.children;
        for (unsigned int idx = 0; idx != kids.length; ++idx) {
            gumbo_search_by_tag(elemvect, static_cast<GumboNode*>(kids.data[idx]), searchedtag);
        }
    }
}

// Collects the visible text below node.
//  - A text node yields its text.
//  - An element other than <script>/<style> yields the text of its children
//    joined together; a non-empty child text is preceded by one space unless
//    it belongs to the very first child.
//  - Everything else yields an empty string.
std::string gumbo_cleantext(GumboNode* node) {
    if (node->type == GUMBO_NODE_TEXT) {
        return std::string(node->v.text.text);
    }

    if (node->type != GUMBO_NODE_ELEMENT) {
        return "";
    }

    auto& element = node->v.element;
    if (element.tag == GUMBO_TAG_SCRIPT || element.tag == GUMBO_TAG_STYLE) {
        return "";
    }

    std::string joined;
    const GumboVector& kids = element.children;
    for (unsigned int idx = 0; idx < kids.length; ++idx) {
        const std::string piece = gumbo_cleantext(static_cast<GumboNode*>(kids.data[idx]));
        if (piece.empty()) {
            continue; // Nothing to add for this child
        }
        // The separator depends on the child index, not on whether anything
        // has been collected so far
        if (idx != 0) {
            joined += " ";
        }
        joined += piece;
    }
    return joined;
}

// Collects the values of attribute attrkey from elements with tag
// expected_tag.
//  - If node itself has the expected tag, only its own attribute is returned
//    and its children are not searched.
//  - Otherwise the children are searched recursively, in document order, and
//    the search does not descend into a matching element.
//  - Non-element nodes and matching elements that lack the attribute
//    contribute nothing.
// Returns the collected values; the vector may be empty.
std::vector<std::string> gumbo_get_attr(GumboNode *node, const std::string& attrkey, const GumboTag& expected_tag) {
    // The node type must be checked BEFORE v.element is touched: v is a union,
    // and for text-like nodes the bytes behind v.element.tag belong to the
    // text payload and may accidentally compare equal to a tag value.
    if (node->type != GUMBO_NODE_ELEMENT) {
        return {};
    }

    // Check if the current element is already the right one
    if (node->v.element.tag == expected_tag) {
        // gumbo_get_attribute() returns NULL when the attribute is absent
        const GumboAttribute* attr = gumbo_get_attribute(&node->v.element.attributes, attrkey.c_str());
        if (attr == nullptr) {
            return {};
        }
        return {std::string(attr->value)};
    }

    // Otherwise gather the results of all children. The recursive call
    // handles every case for a child: non-element (nothing), matching
    // element (its attribute only) and other elements (descend further).
    std::vector<std::string> attrvals;
    const GumboVector* children = &node->v.element.children;
    for (unsigned int it = 0; it < children->length; ++it) {
        GumboNode* childnode = static_cast<GumboNode*>(children->data[it]);
        const std::vector<std::string> found = gumbo_get_attr(childnode, attrkey, expected_tag);
        attrvals.insert(attrvals.end(), found.begin(), found.end());
    }

    // Return the final result
    return attrvals;
}

// Returns the trimmed text of the first DIRECT child element of node whose
// tag equals searchtag (grandchildren are not searched).
// node may be an element or the document node.
// Throws libCommsy::scrapError when node has no children to search or no
// direct child element carries the tag.
std::string gumbo_find_text_by_tag(GumboNode *node, const GumboTag& searchtag) {
    // Pick the child list that is valid for this node type; text-like nodes
    // have none, and reading v.element on them would misinterpret the union.
    const GumboVector* children = nullptr;
    if (node->type == GUMBO_NODE_ELEMENT) {
        children = &node->v.element.children;
    } else if (node->type == GUMBO_NODE_DOCUMENT) {
        children = &node->v.document.children;
    } else {
        throw libCommsy::scrapError();
    }

    // Iterate through the children
    for (unsigned int it = 0; it < children->length; ++it) {
        GumboNode* childnode = static_cast<GumboNode*>(children->data[it]);
        // Only element children have a tag; check the type first so that
        // text children are never read through v.element
        if (childnode->type == GUMBO_NODE_ELEMENT && childnode->v.element.tag == searchtag) {
            return trim(gumbo_cleantext(childnode));
        }
    }

    throw libCommsy::scrapError();
}


namespace libCommsyLowlevel {
auto get_posts(GumboNode *node) {
    std::vector<GumboNode *> posts;
    gumbo_search_by_class(&posts, node, "uk-comment", GUMBO_TAG_ARTICLE);
    return posts;
}

std::string get_post_name(GumboNode *node) {
    std::vector<GumboNode *> titlenodes;
    gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4);
    if (titlenodes.empty())
        throw libCommsy::scrapError();
    return trim(gumbo_cleantext(titlenodes[0]));
}

std::string get_post_id(GumboNode *node) {
    return gumbo_get_attr(node, "data-item-id", GUMBO_TAG_ARTICLE)[0];
}

std::string get_post_meta(GumboNode *node) {
    std::vector<GumboNode *> metanodes;
    gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
    if (metanodes.size() < 2)
        throw libCommsy::scrapError();
    return clean_spaces(trim(gumbo_cleantext(metanodes[1])));
}

std::string get_post_url(GumboNode *node) {
    std::vector<GumboNode *> urlnodes;
    gumbo_search_by_class(&urlnodes, node, "uk-comment-title", GUMBO_TAG_H4);
    if (urlnodes.empty())
        throw libCommsy::scrapError();
    return gumbo_get_attr(urlnodes[0], "href", GUMBO_TAG_A)[0];
}

bool get_post_unread(GumboNode *node) {
    std::vector<GumboNode *> elems;
    gumbo_search_by_class(&elems, node, "cs-comment-change-info", GUMBO_TAG_DIV);
    return !elems.empty();
}

taskState::type get_post_taskState(GumboNode *node) {
    // Find all elements that could contain the information we need and grab their "class" attribute
    std::vector<std::string> divClassAttrs;
    divClassAttrs = gumbo_get_attr(node, "class", GUMBO_TAG_I);

    // Try to find the information we need
    for (const auto& classAttr : divClassAttrs) {
        if (classAttr.find("todo") != std::string::npos) {
            return taskState::todo;
        } else if (classAttr.find("inProgress") != std::string::npos) {
            return taskState::inProgress;
        } else if (classAttr.find("done") != std::string::npos) {
            return taskState::done;
        }
    }

    return taskState::none;
}

std::vector<std::map<std::string, std::string>> get_post_files(GumboNode *node) {
    std::vector<GumboNode *> metanodes;
    std::vector<std::string> fileurls;
    std::vector<std::string> filenames;
    std::vector<std::map<std::string, std::string>> filenameurlmap;
    std::map<std::string, std::string> tmpmap;

    // Get meta nodes
    gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
    // Get URLs
    fileurls = gumbo_get_attr(metanodes[2], "href", GUMBO_TAG_A);
    // Get filenames
    filenames = gumbo_get_attr(metanodes[2], "title", GUMBO_TAG_A);

    // Generate map
    auto urlit = fileurls.begin();
    auto nameit = filenames.begin();
    while (true) {

        // Break if last item was reached
        if (urlit == fileurls.end() or nameit == filenames.end()) {
            break;
        }

        // Generate temporary map
        tmpmap = {};
        tmpmap[*nameit] = *urlit;
        // Append it to the result vector map
        filenameurlmap.push_back(tmpmap);
        // Get next item in both vectors
        urlit++; nameit++;
    }
    return filenameurlmap;
}

std::string get_post_desc(const std::string& post_url, const std::string& server_sid) {
    std::string material_id;
    std::stringstream httpcontent;
    GumboOutput *post_document;
    GumboNode *desc_node;
    std::vector<GumboNode *> results;

    // Get material ID
    material_id = get_filename(post_url);
    // Download post
    long statuscode = curlreq(httpcontent, server_sid, post_url);
    // Check statuscode
    if (statuscode != 200) {
        throw libCommsy::descDownloadError();
    }

    // Parse post
    post_document = gumbo_parse(httpcontent.str().c_str());
    // Get description element
    desc_node = gumbo_search_by_id(post_document->root, "description" + material_id, GUMBO_TAG_DIV);
    // Extract description
    gumbo_search_by_tag(&results, desc_node, GUMBO_TAG_P);

    // Cencenate occurencies
    std::string result_string;
    for (const auto& result : results) {
        result_string.append(trim(gumbo_cleantext(result)) + "\n");
    }

    // Return first occurence
    return result_string;
}
};


using namespace libCommsyLowlevel;
// Class functions
// Returns true when postID is a valid index, i.e. smaller than the number of
// posts counted in numposts.
bool libCommsy::postExists(unsigned long postID) {
    const bool in_range = numposts > postID;
    return in_range;
}

// Returns a pointer to the stored post with index postID.
// Throws invalidPostError when postExists(postID) is false.
commsyPost *libCommsy::getPost(unsigned long postID) {
    if (postExists(postID)) {
        // Hand out a pointer into the posts container
        return &posts[postID];
    }
    throw invalidPostError();
}

std::string *libCommsy::getDescription(unsigned long postID) {
    // Get post
    commsyPost *thispost = getPost(postID);

    // Check if post description was downloaded already
    if (thispost->description == "\xFF") {
        // Download post
        thispost->description = get_post_desc(server_url + thispost->url, server_sid);
    }

    // Return it
    return &thispost->description;
}

// Fetches the feed of `room` from the server page by page and stores every
// post found in `posts`.
//  _server_url  Base URL; "/room/<room>/feed/0/date?lastId=<id>" is appended.
//  _server_sid  Session ID handed to curlreq().
//  start_id     Initial value for the lastId query parameter.
//  max_posts    Stop once this many posts were loaded; 0 means no limit.
// Loading also stops as soon as a page contains no posts.
// Throws connectionFailError if the request throws or returns an unhandled
// status, invalidSIDError on status 302, invalidRoomError on status 500, and
// scrapError (from the scraping helpers) if a post cannot be parsed.
libCommsy::libCommsy(const std::string& _server_url, const std::string& _server_sid, const std::string& room, const std::string start_id, const unsigned long max_posts) {
    // Define required variables
    server_url = _server_url;
    server_sid = _server_sid;
    lastID = start_id;
    numposts = 0;
    std::stringstream httpcontent;

    // Releases a parse tree; used as unique_ptr deleter so that the tree is
    // freed on every way out of an iteration (next page, return, exception)
    auto destroy_output = [](GumboOutput *output) {
        gumbo_destroy_output(&kGumboDefaultOptions, output);
    };

    // Loop until all or max_posts posts are fetched
    while (true) {
        // Check connection and download feed
        long statuscode;
        try {
            statuscode = curlreq(httpcontent, server_sid, server_url + "/room/" + room + "/feed/0/date?lastId=" + lastID);
        } catch (const std::exception&) {
            throw connectionFailError();
        }
        if (statuscode == 302) {
            throw invalidSIDError();
        } else if (statuscode == 500) {
            throw invalidRoomError();
        } else if (statuscode != 200) {
            std::cout << "Unhandled status code " << statuscode << std::endl;
            throw connectionFailError();
        }

        // Initialise document; the buffer is kept in a named string so that
        // it stays alive for as long as the parse tree does
        const std::string html = httpcontent.str();
        const std::unique_ptr<GumboOutput, decltype(destroy_output)> document(gumbo_parse(html.c_str()), destroy_output);

        // Get posts
        const auto gumboPosts = get_posts(document->root);
        if (gumboPosts.empty()) {
            // Stop fetching more data
            return;
        }

        // Convert every post element of this page into a commsyPost
        for (GumboNode *postnode : gumboPosts) {
            // Create post struct
            commsyPost thispost;
            thispost.name = get_post_name(postnode);
            thispost.id = get_post_id(postnode);
            thispost.meta = get_post_meta(postnode);
            thispost.unread = get_post_unread(postnode);
            thispost.taskState = get_post_taskState(postnode);
            thispost.url = get_post_url(postnode);
            // Get posts files
            for (const auto& filemap : get_post_files(postnode)) {
                for (const auto& [filename, fileurl] : filemap) {
                    commsyFile thisfile;
                    thisfile.name = filename;
                    thisfile.url = fileurl;
                    thispost.files.push_back(thisfile);
                }
            }

            // Append to posts vector
            posts.push_back(thispost);
            // Increment post counter
            numposts++;
            // The next page is requested relative to the newest stored post
            lastID = posts.back().id;
            // Check if maximum amount of posts to load was reached
            if (max_posts != 0 and numposts >= max_posts) {
                // Stop loading more posts
                return;
            }
        }
        // Empty the buffer so the next page does not get appended to this one
        httpcontent.str(std::string());
    }
}