// qcommsy/libcommsy.hpp

#include <algorithm>
#include <exception>
#include <fstream>
#include <iostream>
#include <list>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

#include <csignal>
#include <cstdio>
#include <unistd.h>

#include <curlpp/cURLpp.hpp>
#include <curlpp/Easy.hpp>
#include <curlpp/Infos.hpp>
#include <curlpp/Options.hpp>

#include <nlohmann/json.hpp>
using json = nlohmann::json;

#include <gumbo.h>

// Exception types used by this library. They are empty tag classes and do not
// derive from std::exception, so a std::exception handler will not catch them.
// Thrown by the libCommsy constructor when the feed request answers with HTTP 302.
class invalidSIDError {};
// Thrown by the libCommsy constructor when the feed request answers with HTTP 500.
class invalidRoomError {};
// Thrown by libCommsy::getPost when the requested index does not refer to a loaded post.
class invalidPostError {};
// Thrown by the libCommsy constructor when the request itself fails or returns an unhandled status code.
class connectionFailError {};
// Thrown when an element with a required id could not be found while parsing (e.g. by gumbo_search_by_id).
class parsingNoSuchIDError {};
// Thrown when a required tag could not be found while parsing (e.g. by gumbo_find_text_by_tag).
class parsingNoSuchTagError {};
// Thrown by get_post_desc when the post page request does not answer with HTTP 200.
class descDownloadError {};


// Connection settings shared by the free functions and the libCommsy class;
// they are assigned by the libCommsy constructor. Being declared static, every
// translation unit including this header gets its own copy.
// Base URL of the server; request paths are appended to it.
static std::string server_url;
// Session ID that is sent as the "SID" cookie.
static std::string server_sid;
// Room identifier that is inserted into the feed URL.
static std::string room;


// Returns a copy of `s` without its leading whitespace.
// Whitespace is the fixed ASCII set space, \t, \n, \v, \f and \r (what
// [[:space:]] matches in the "C" locale), so the result does not depend on
// the global locale and no regular expression has to be compiled or run.
// An empty or all-whitespace input yields an empty string.
std::string ltrim(const std::string& s) {
    const std::string::size_type first_kept = s.find_first_not_of(" \t\n\v\f\r");
    if (first_kept == std::string::npos) {
        return std::string();
    }
    return s.substr(first_kept);
}
// Returns a copy of `s` without its trailing whitespace.
// Whitespace is the fixed ASCII set space, \t, \n, \v, \f and \r (what
// [[:space:]] matches in the "C" locale), so the result does not depend on
// the global locale and no regular expression has to be compiled or run.
// An empty or all-whitespace input yields an empty string.
std::string rtrim(const std::string& s) {
    const std::string::size_type last_kept = s.find_last_not_of(" \t\n\v\f\r");
    if (last_kept == std::string::npos) {
        return std::string();
    }
    return s.substr(0, last_kept + 1);
}
// Returns a copy of `s` with whitespace removed from both ends.
std::string trim(const std::string& s) {
    // Strip the tail first, then the head of the shortened copy
    const std::string without_tail = rtrim(s);
    return ltrim(without_tail);
}

// Returns the part of `path` that follows its last '/' or '\\'.
// A path without any separator is returned unchanged; a path that ends in a
// separator yields an empty string.
std::string get_filename(const std::string& path) {
    const std::string::size_type last_separator = path.find_last_of("/\\");
    if (last_separator == std::string::npos) {
        return path;
    }
    return path.substr(last_separator + 1);
}

// Normalises a text block in three steps:
//   1. every run of two or more spaces is removed completely,
//   2. every newline is turned into a single space,
//   3. the first four characters of the result are dropped
//      (fewer if the result is shorter than that).
std::string clean_spaces(const std::string& s) {
    static const std::regex space_run{"[ ]{2,}", std::regex_constants::extended};

    std::string cleaned = std::regex_replace(s, space_run, "");
    for (char& ch : cleaned) {
        if (ch == '\n') {
            ch = ' ';
        }
    }
    cleaned.erase(0, 4);
    return cleaned;
}

// Returns `base` followed by all elements of `addition`, in their original order.
// `base` is taken by value, so the caller's vector is left untouched.
std::vector<std::string> merge_strvects(std::vector<std::string> base, const std::vector<std::string> &addition) {
    base.reserve(base.size() + addition.size());
    for (const std::string& item : addition) {
        base.push_back(item);
    }
    return base;
}

// Requests `URL` with the cookie "SID=<SID>" and writes the response body into
// `responsebuffer`. The URL and the SID are printed to stdout beforehand.
// Returns the HTTP response code reported by curlpp for the request.
// Nothing is caught here, so whatever request.perform() throws reaches the caller.
static long curlreq(std::stringstream &responsebuffer, std::string SID, std::string URL) {
    // Log the connection details
    std::cout << "Connection details begin" << std::endl
              << "URL: " << URL << std::endl
              << "SID: " << SID << std::endl
              << "Connection details end" << std::endl;

    // Set up the curlpp handle
    curlpp::Cleanup cleaner;
    curlpp::Easy request;

    // Session cookie header
    const std::list<std::string> headers{"Cookie: SID=" + SID};
    request.setOpt(new curlpp::options::HttpHeader(headers));
    // Target URL
    request.setOpt(new curlpp::options::Url(URL));
    // Let cURL write the response body into the caller's stream
    request.setOpt(new curlpp::options::WriteStream(&responsebuffer));

    // Run the request and hand back its status code
    request.perform();
    return curlpp::infos::ResponseCode::get(request);
}

// Walks the tree below (and including) `node` depth-first and appends to
// `elemvect` every element that has the tag `expectedtag` and whose attribute
// `attrname` is exactly equal to `searchword`. Non-element nodes are ignored.
// Matches are appended in document order; the search also continues inside
// elements that already matched.
void gumbo_search_by_attr(std::vector<GumboNode *> &elemvect, GumboNode* node, std::string attrname, std::string searchword, GumboTag expectedtag) {
    // Only element nodes carry a tag, attributes and children
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }

    // Check the element itself; the attribute is only looked up for the wanted tag
    if (node->v.element.tag == expectedtag) {
        GumboAttribute* attr = gumbo_get_attribute(&node->v.element.attributes, attrname.c_str());
        if (attr != nullptr && attr->value == searchword) {
            elemvect.push_back(node);
        }
    }

    // Continue with all children
    GumboVector* kids = &node->v.element.children;
    for (unsigned int idx = 0; idx != kids->length; ++idx) {
        GumboNode* child = static_cast<GumboNode*>(kids->data[idx]);
        gumbo_search_by_attr(elemvect, child, attrname, searchword, expectedtag);
    }
}

// Appends to `elemvect` every `expectedtag` element at or below `node` whose
// "class" attribute matches `searchword`; the matching itself is done by
// gumbo_search_by_attr.
void gumbo_search_by_class(std::vector<GumboNode *> &elemvect, GumboNode* node, std::string searchword, GumboTag expectedtag) {
    gumbo_search_by_attr(elemvect, node, "class", searchword, expectedtag);
}

// Returns the first `expectedtag` element at or below `node` whose "id"
// attribute matches `searchword`.
// Throws parsingNoSuchIDError when there is no such element.
GumboNode *gumbo_search_by_id(GumboNode* node, std::string searchword, GumboTag expectedtag) {
    std::vector<GumboNode *> matches;
    gumbo_search_by_attr(matches, node, "id", searchword, expectedtag);

    // Nothing found: report it to the caller
    if (matches.empty()) {
        throw parsingNoSuchIDError();
    }
    // Otherwise the first match wins
    return matches.front();
}

// Walks the tree below (and including) `node` depth-first and appends every
// element with the tag `searchedtag` to `elemvect`, in document order.
// Non-element nodes are ignored.
void gumbo_search_by_tag(std::vector<GumboNode *> &elemvect, GumboNode* node, GumboTag searchedtag) {
    if (node->type == GUMBO_NODE_ELEMENT) {
        // The element itself
        if (node->v.element.tag == searchedtag) {
            elemvect.push_back(node);
        }

        // Its children
        GumboVector* kids = &node->v.element.children;
        for (unsigned int idx = 0; idx != kids->length; ++idx) {
            GumboNode* child = static_cast<GumboNode*>(kids->data[idx]);
            gumbo_search_by_tag(elemvect, child, searchedtag);
        }
    }
}

// Collects the text below `node`.
//   - A text node yields its text.
//   - An element other than <script>/<style> yields the texts of its children,
//     joined in order; a single space is inserted before every non-empty
//     child text except for the first child.
//   - Everything else yields an empty string.
static std::string gumbo_cleantext(GumboNode* node) {
    if (node->type == GUMBO_NODE_TEXT) {
        return std::string(node->v.text.text);
    }

    // The tag is only inspected once the node is known to be an element
    const bool is_text_container = node->type == GUMBO_NODE_ELEMENT &&
                                   node->v.element.tag != GUMBO_TAG_SCRIPT &&
                                   node->v.element.tag != GUMBO_TAG_STYLE;
    if (!is_text_container) {
        return "";
    }

    std::string joined;
    GumboVector* kids = &node->v.element.children;
    for (unsigned int idx = 0; idx < kids->length; ++idx) {
        const std::string piece = gumbo_cleantext(static_cast<GumboNode*>(kids->data[idx]));
        if (idx > 0 && !piece.empty()) {
            joined += " ";
        }
        joined += piece;
    }
    return joined;
}

// Collects the values of the attribute `attrkey` from `expected_tag` elements.
//   - If `node` itself is an `expected_tag` element, only its own attribute is
//     reported and its children are not visited.
//   - Otherwise the children are searched recursively, in document order.
// Elements that do not carry the attribute contribute nothing, and non-element
// nodes are skipped, so the result may be empty.
std::vector<std::string> gumbo_get_attr(GumboNode *node, std::string attrkey, GumboTag expected_tag) {
    std::vector<std::string> attrvals;

    // v.element is only meaningful for element nodes, so check the type
    // before touching the tag, the attributes or the children
    if (node->type != GUMBO_NODE_ELEMENT) {
        return attrvals;
    }

    // The current element is the wanted one: report its attribute, if present
    if (node->v.element.tag == expected_tag) {
        // gumbo_get_attribute yields a null pointer for a missing attribute
        const auto *attr = gumbo_get_attribute(&node->v.element.attributes, attrkey.c_str());
        if (attr != nullptr) {
            attrvals.push_back(attr->value);
        }
        return attrvals;
    }

    // Otherwise gather the values found below each child
    GumboVector* children = &node->v.element.children;
    for (unsigned int it = 0; it < children->length; ++it) {
        GumboNode *childnode = static_cast<GumboNode*>(children->data[it]);
        const std::vector<std::string> found = gumbo_get_attr(childnode, attrkey, expected_tag);
        attrvals.insert(attrvals.end(), found.begin(), found.end());
    }

    return attrvals;
}

// Returns the trimmed text of the first direct child of `node` that is an
// element with the tag `searchtag`. Only direct children are considered.
// Throws parsingNoSuchTagError when `node` is not an element or has no such child.
std::string gumbo_find_text_by_tag(GumboNode *node, GumboTag searchtag) {
    // v.element is only meaningful for element nodes
    if (node->type == GUMBO_NODE_ELEMENT) {
        GumboVector* children = &node->v.element.children;

        for (unsigned int it = 0; it < children->length; ++it) {
            GumboNode *childnode = static_cast<GumboNode*>(children->data[it]);
            // Text/comment children have no tag; reading it would interpret unrelated data
            if (childnode->type == GUMBO_NODE_ELEMENT && childnode->v.element.tag == searchtag) {
                return trim(gumbo_cleantext(childnode));
            }
        }
    }

    throw parsingNoSuchTagError();
}


// Returns all post nodes at or below `node`, i.e. every <article> element
// whose class attribute is "uk-comment", in document order.
auto get_posts(GumboNode *node) {
    std::vector<GumboNode *> articles;
    gumbo_search_by_class(articles, node, "uk-comment", GUMBO_TAG_ARTICLE);
    return articles;
}

// Returns the trimmed text of the first <h4 class="uk-comment-title"> element
// at or below the post node `node`.
// Throws parsingNoSuchTagError when the post has no such title element.
std::string get_post_name(GumboNode *node) {
    std::vector<GumboNode *> titlenodes;
    gumbo_search_by_class(titlenodes, node, "uk-comment-title", GUMBO_TAG_H4);
    // Indexing an empty vector would be undefined behaviour
    if (titlenodes.empty()) {
        throw parsingNoSuchTagError();
    }
    return trim(gumbo_cleantext(titlenodes.front()));
}

// Returns the "data-item-id" attribute of the first <article> element found
// at or below `node`.
// Throws parsingNoSuchTagError when no value could be found.
std::string get_post_id(GumboNode *node) {
    const std::vector<std::string> ids = gumbo_get_attr(node, "data-item-id", GUMBO_TAG_ARTICLE);
    // Indexing an empty vector would be undefined behaviour
    if (ids.empty()) {
        throw parsingNoSuchTagError();
    }
    return ids.front();
}

// Returns the cleaned-up text of the second <div class="uk-comment-meta">
// element at or below the post node `node` (trimmed, then passed through
// clean_spaces).
// Throws parsingNoSuchTagError when the post has fewer than two such elements.
std::string get_post_meta(GumboNode *node) {
    std::vector<GumboNode *> metanodes;
    gumbo_search_by_class(metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
    // The meta line is the second match; make sure it exists before indexing
    if (metanodes.size() < 2) {
        throw parsingNoSuchTagError();
    }
    return clean_spaces(trim(gumbo_cleantext(metanodes[1])));
}

// Returns the "href" of the first <a> element inside the first
// <h4 class="uk-comment-title"> element at or below the post node `node`.
// Throws parsingNoSuchTagError when the title element or the link is missing.
std::string get_post_url(GumboNode *node) {
    std::vector<GumboNode *> titlenodes;
    gumbo_search_by_class(titlenodes, node, "uk-comment-title", GUMBO_TAG_H4);
    // Indexing an empty vector would be undefined behaviour
    if (titlenodes.empty()) {
        throw parsingNoSuchTagError();
    }

    const std::vector<std::string> hrefs = gumbo_get_attr(titlenodes.front(), "href", GUMBO_TAG_A);
    if (hrefs.empty()) {
        throw parsingNoSuchTagError();
    }
    return hrefs.front();
}

// Lists the files attached to the post node `node`.
// The links are taken from the third <div class="uk-comment-meta"> element of
// the post: every <a> element in it that carries both a "title" and an "href"
// attribute produces one single-entry map {title -> href}, in document order.
// Links lacking one of the two attributes are skipped, and a post with fewer
// than three meta elements yields an empty list.
std::vector<std::map<std::string, std::string>> get_post_files(GumboNode *node) {
    std::vector<std::map<std::string, std::string>> filenameurlmap;

    // Get meta nodes; the file list lives in the third one
    std::vector<GumboNode *> metanodes;
    gumbo_search_by_class(metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
    if (metanodes.size() < 3) {
        return filenameurlmap;
    }

    // Get all links of the file list
    std::vector<GumboNode *> links;
    gumbo_search_by_tag(links, metanodes[2], GUMBO_TAG_A);

    // Read name and URL from the same element so both always belong together
    for (GumboNode *link : links) {
        // gumbo_get_attribute yields a null pointer for a missing attribute
        const auto *href = gumbo_get_attribute(&link->v.element.attributes, "href");
        const auto *title = gumbo_get_attribute(&link->v.element.attributes, "title");
        if (href == nullptr || title == nullptr) {
            continue;
        }

        std::map<std::string, std::string> entry;
        entry[title->value] = href->value;
        filenameurlmap.push_back(entry);
    }
    return filenameurlmap;
}

std::string get_post_desc(std::string post_url) {
    std::string material_id;
    std::stringstream httpcontent;
    GumboOutput *post_document;
    GumboNode *desc_node;
    std::vector<GumboNode *> results;

    // Get material ID
    material_id = get_filename(post_url);
    // Download post
    long statuscode = curlreq(httpcontent, server_sid, post_url);
    // Check statuscode
    if (statuscode != 200) {
        throw descDownloadError();
    }

    // Parse post
    post_document = gumbo_parse(httpcontent.str().c_str());
    // Get description element
    desc_node = gumbo_search_by_id(post_document->root, "description" + material_id, GUMBO_TAG_DIV);
    // Extract description
    gumbo_search_by_tag(results, desc_node, GUMBO_TAG_P);

    // Cencenate occurencies
    std::string result_string;
    for (auto it = results.begin(); it != results.end(); it++) {
        result_string.append(trim(gumbo_cleantext(*it)) + "\n");
    }

    // Return first occurence
    return result_string;
}


// One post of the room feed as collected by the libCommsy constructor.
struct commsyPost {
    // Title of the post (result of get_post_name)
    std::string name;
    // Item ID of the post (result of get_post_id); the constructor also uses it as "lastId" for the next feed request
    std::string id;
    // Description text; stays empty until libCommsy::getDescription downloads it
    std::string description;
    // Meta line of the post (result of get_post_meta)
    std::string meta;
    // URL of the post as found in the feed (result of get_post_url); getDescription prepends server_url to it
    std::string url;
    // Attached files: file name (link title) mapped to the file URL
    std::map<std::string, std::string> files;
};


// Name and version of this library as string-literal macros.
#define libCommsy_NAME "libcommsy"
#define libCommsy_VERSION "1.1-stable"
class libCommsy {
public:
    std::vector<commsyPost> posts;
    unsigned long numposts;
    std::string lastID;

    bool postExists(unsigned long postID) {
        return postID < numposts;
    }

    commsyPost *getPost(unsigned long postID) {
        // Check if post exists
        if (not postExists(postID)) {
            throw invalidPostError();
        }
        // Return post pointer
        return &posts[postID];
    }

    std::string *getDescription(unsigned long postID) {
        // Get post
        commsyPost *thispost = getPost(postID);

        // Check if post description was downloaded already
        if (thispost->description.empty()) {
            // Download post
            thispost->description = get_post_desc(server_url + thispost->url);
        }
        // Return it
        return &thispost->description;
    }

    libCommsy(const std::string& _server_url, const std::string& _server_sid, const std::string& _room, const std::string start_id = "", const unsigned long max_posts = 0) {
        // Define required variables
        server_url = _server_url;
        server_sid = _server_sid;
        room = _room;
        lastID = start_id;
        std::stringstream httpcontent;
        GumboOutput *document;
        long statuscode;
        numposts = 0;

        // Loop until all or max_posts posts are fetched
        while (1) {
            // Check connection and download feed
            try {
                statuscode = curlreq(httpcontent, server_sid, server_url + "/room/" + room + "/feed/10/date?lastId=" + lastID);
            } catch (std::exception&) {
                throw connectionFailError();
            }
            if (statuscode == 302) {
                throw invalidSIDError();
            } else if (statuscode == 500) {
                throw invalidRoomError();
            } else if (statuscode != 200) {
                std::cout << "Unhandled status code " << statuscode << std::endl;
                throw connectionFailError();
            }

            // Do some stuff XD
            document = gumbo_parse(httpcontent.str().c_str());
            httpcontent.str(std::string()); // Clear buffer just in case we need it later

            // Get posts
            auto gumboPosts = get_posts(document->root);
            if (gumboPosts.size() == 0) {
                // Stop fetching more data
                break;
            }

            // Map posts and their corresponding URL to a number
            for (auto it = gumboPosts.begin(); it != gumboPosts.end(); it++) {
                // Create post struct
                commsyPost thispost;
                {
                    // Get posts name
                    thispost.name = get_post_name(*it);
                    // Get posts ID
                    thispost.id = get_post_id(*it);
                    // Get posts meta string
                    thispost.meta = get_post_meta(*it);
                    // Get posts URL
                    thispost.url = get_post_url(*it);
                    // Get posts files
                    auto files = get_post_files(*it);
                    for (const auto& filemap : files) {
                        thispost.files.insert(filemap.begin(), filemap.end());
                    }
                }

                // Append to posts vector
                posts.push_back(thispost);
                // Increment post counter
                numposts++;
                // Get lastID
                lastID = posts.back().id;
                // Check if maximum amount of posts to load was exceeded
                if (numposts == max_posts) {
                    // Stop loading more posts
                    break;
                }
            }
        }
    }
};