1
0
Fork 0
mirror of https://gitlab.com/niansa/qcommsy.git synced 2025-03-06 20:53:33 +01:00
qcommsy/libcommsy.cpp
2020-11-11 17:42:35 +01:00

433 lines
14 KiB
C++

/*
QCommsy
Copyright (C) 2020 niansa
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <algorithm>
#include <csignal>
#include <cstdio>
#include <exception>
#include <fstream>
#include <iostream>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
#include <unistd.h>
#include <gumbo.h>
#include "libcommsy.hpp"
// Returns a copy of `s` with all leading whitespace removed.
// Whitespace means the six characters of the classic "C" locale:
// space, \t, \n, \v, \f and \r.
// A plain character scan is used instead of std::regex: it yields the same
// result, is much cheaper, and does not depend on how the regex engine copes
// with very long whitespace runs.
std::string ltrim(const std::string& s) {
    const std::string::size_type first_kept = s.find_first_not_of(" \t\n\v\f\r");
    // npos: the string is empty or consists of whitespace only.
    if (first_kept == std::string::npos) {
        return std::string();
    }
    return s.substr(first_kept);
}
// Returns a copy of `s` with all trailing whitespace removed.
// Whitespace means the six characters of the classic "C" locale:
// space, \t, \n, \v, \f and \r.
// A plain backwards scan is used instead of std::regex: the former pattern
// had no start anchor, so every interior whitespace run was re-scanned from
// each of its positions before the match failed at the end-of-string anchor.
std::string rtrim(const std::string& s) {
    const std::string::size_type last_kept = s.find_last_not_of(" \t\n\v\f\r");
    // npos: the string is empty or consists of whitespace only.
    if (last_kept == std::string::npos) {
        return std::string();
    }
    return s.substr(0, last_kept + 1);
}
// Returns a copy of `s` with whitespace stripped from both ends.
// The trailing side is stripped first, then the leading side.
std::string trim(const std::string& s) {
    const std::string without_trailing = rtrim(s);
    return ltrim(without_trailing);
}
// Returns the part of `path` that follows its last '/' or '\\' separator.
// If `path` contains no separator, the whole string is returned; if it ends
// with a separator, the result is empty.
std::string get_filename(const std::string& path) {
    const std::string::size_type last_separator = path.find_last_of("/\\");
    if (last_separator == std::string::npos) {
        return path;
    }
    return path.substr(last_separator + 1);
}
// Normalises the spacing of `s` and drops its first four characters.
// Steps, in effect:
//   1. every run of two or more consecutive spaces is removed entirely
//      (a single space is kept as is),
//   2. every '\n' is turned into a single space,
//   3. the first four characters of the result are erased (fewer if the
//      result is shorter).
// NOTE(review): the fixed four-character cut presumably strips a constant
// prefix of the text passed in by the caller - confirm against the page markup.
// The work is done in one pass over the input; the previous version needed
// std::regex and std::replace (the latter without <algorithm> being included).
std::string clean_spaces(const std::string& s) {
    std::string cleaned;
    cleaned.reserve(s.size());
    std::string::size_type pos = 0;
    while (pos < s.size()) {
        const char current = s[pos];
        if (current == ' ') {
            // Measure the whole run of spaces starting here.
            std::string::size_type run_end = s.find_first_not_of(' ', pos);
            if (run_end == std::string::npos) {
                run_end = s.size();
            }
            // Lone spaces survive, longer runs vanish completely.
            if (run_end - pos == 1) {
                cleaned.push_back(' ');
            }
            pos = run_end;
        } else {
            cleaned.push_back(current == '\n' ? ' ' : current);
            ++pos;
        }
    }
    cleaned.erase(0, 4);
    return cleaned;
}
// Returns `base` with every element of `addition` appended, in order.
// `base` is taken by value, so the caller's vector is left untouched.
std::vector<std::string> merge_strvects(std::vector<std::string> base, const std::vector<std::string> &addition) {
    base.reserve(base.size() + addition.size());
    for (const std::string& item : addition) {
        base.push_back(item);
    }
    return base;
}
extern long curlreq(std::stringstream &responsebuffer, std::string SID, std::string URL);
// Depth-first search below (and including) `node` for elements whose tag is
// `expectedtag` and whose attribute `attrname` has exactly the value
// `searchword`. Every match is appended to `*elemvect` in document order.
// Nodes that are not of type GUMBO_NODE_ELEMENT are ignored, together with
// anything beneath them.
void gumbo_search_by_attr(std::vector<GumboNode *> *elemvect, GumboNode* node, const std::string& attrname, const std::string& searchword, const GumboTag& expectedtag) {
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }
    GumboElement& element = node->v.element;
    // Test the node itself first so parents precede their descendants.
    if (element.tag == expectedtag) {
        const GumboAttribute* attribute = gumbo_get_attribute(&element.attributes, attrname.c_str());
        if (attribute != nullptr && attribute->value == searchword) {
            elemvect->push_back(node);
        }
    }
    // Then descend into every child.
    const GumboVector& kids = element.children;
    for (unsigned int idx = 0; idx < kids.length; ++idx) {
        GumboNode* child = static_cast<GumboNode*>(kids.data[idx]);
        gumbo_search_by_attr(elemvect, child, attrname, searchword, expectedtag);
    }
}
// Convenience wrapper around gumbo_search_by_attr() for the "class" attribute.
// The comparison is against the complete attribute value, so `searchword`
// must equal the element's whole class string.
inline void gumbo_search_by_class(std::vector<GumboNode *> *elemvect, GumboNode* node, const std::string& searchword, const GumboTag& expectedtag) {
    const std::string class_attribute = "class";
    gumbo_search_by_attr(elemvect, node, class_attribute, searchword, expectedtag);
}
// Returns the first element below (or equal to) `node` that has the tag
// `expectedtag` and an "id" attribute equal to `searchword`.
// Throws libCommsy::scrapError when no such element exists.
GumboNode *gumbo_search_by_id(GumboNode* node, const std::string& searchword, const GumboTag& expectedtag) {
    std::vector<GumboNode *> matches;
    gumbo_search_by_attr(&matches, node, "id", searchword, expectedtag);
    // Nothing found: the page does not look the way the scraper expects.
    if (matches.empty()) {
        throw libCommsy::scrapError();
    }
    // Only the first hit in document order is of interest.
    return matches.front();
}
// Depth-first search below (and including) `node` for elements with the tag
// `searchedtag`; every match is appended to `*elemvect` in document order.
// Nodes that are not of type GUMBO_NODE_ELEMENT are ignored.
void gumbo_search_by_tag(std::vector<GumboNode *> *elemvect, GumboNode* node, const GumboTag& searchedtag) {
    if (node->type != GUMBO_NODE_ELEMENT) {
        return;
    }
    GumboElement& element = node->v.element;
    if (element.tag == searchedtag) {
        elemvect->push_back(node);
    }
    const GumboVector& kids = element.children;
    for (unsigned int idx = 0; idx < kids.length; ++idx) {
        GumboNode* child = static_cast<GumboNode*>(kids.data[idx]);
        gumbo_search_by_tag(elemvect, child, searchedtag);
    }
}
// Collects the text found in `node` and its descendants.
// - A GUMBO_NODE_TEXT node yields its text.
// - An element yields the texts of its children joined together; a single
//   space is put in front of every non-empty child text except for the
//   first child. <script> and <style> elements yield nothing.
// - Every other node type yields an empty string.
std::string gumbo_cleantext(GumboNode* node) {
    if (node->type == GUMBO_NODE_TEXT) {
        return std::string(node->v.text.text);
    }
    if (node->type != GUMBO_NODE_ELEMENT) {
        return "";
    }
    const GumboTag tag = node->v.element.tag;
    if (tag == GUMBO_TAG_SCRIPT || tag == GUMBO_TAG_STYLE) {
        return "";
    }
    std::string joined;
    const GumboVector& kids = node->v.element.children;
    for (unsigned int idx = 0; idx < kids.length; ++idx) {
        const std::string piece = gumbo_cleantext(static_cast<GumboNode*>(kids.data[idx]));
        // The separator depends on the child's position, not on whether
        // anything has been collected so far.
        if (idx > 0 && !piece.empty()) {
            joined += " ";
        }
        joined += piece;
    }
    return joined;
}
// Collects the values of attribute `attrkey` from elements with the tag
// `expected_tag`.
// - If `node` itself is such an element, only its own attribute value is
//   returned (its descendants are not inspected).
// - Otherwise the subtree is searched depth-first; the search does not
//   descend into an element once it matched `expected_tag`.
// - Matching elements that lack the attribute contribute nothing, and nodes
//   that are not elements are skipped.
// Returns the values in document order; the result may be empty.
//
// The node type is checked BEFORE touching `v.element`: for text nodes that
// union member is not the active one, so reading its tag would compare
// unrelated bytes. The attribute pointer is checked as well, because
// gumbo_get_attribute() returns NULL when the attribute is absent.
std::vector<std::string> gumbo_get_attr(GumboNode *node, const std::string& attrkey, const GumboTag& expected_tag) {
    if (node->type != GUMBO_NODE_ELEMENT) {
        return {};
    }
    // Is the current element already the wanted one?
    if (node->v.element.tag == expected_tag) {
        const GumboAttribute *attr = gumbo_get_attribute(&node->v.element.attributes, attrkey.c_str());
        if (attr == nullptr) {
            return {};
        }
        return {attr->value};
    }
    // Otherwise gather the results of all children; the recursive call
    // handles matching children, nested elements and non-element nodes alike.
    std::vector<std::string> attrvals;
    const GumboVector *children = &node->v.element.children;
    for (unsigned int it = 0; it < children->length; ++it) {
        GumboNode *childnode = static_cast<GumboNode*>(children->data[it]);
        const std::vector<std::string> found = gumbo_get_attr(childnode, attrkey, expected_tag);
        attrvals.insert(attrvals.end(), found.begin(), found.end());
    }
    return attrvals;
}
// Returns the trimmed text of the first direct child of `node` that is an
// element with the tag `searchtag`. Only direct children are inspected.
// Throws libCommsy::scrapError when `node` is not an element or has no such
// child.
//
// Both `node` and each child are checked to be element nodes before their
// `v.element` member is read; for other node types that union member is not
// the active one.
std::string gumbo_find_text_by_tag(GumboNode *node, const GumboTag& searchtag) {
    if (node->type == GUMBO_NODE_ELEMENT) {
        const GumboVector *children = &node->v.element.children;
        for (unsigned int it = 0; it < children->length; ++it) {
            GumboNode *childnode = static_cast<GumboNode*>(children->data[it]);
            if (childnode->type == GUMBO_NODE_ELEMENT && childnode->v.element.tag == searchtag) {
                return trim(gumbo_cleantext(childnode));
            }
        }
    }
    throw libCommsy::scrapError();
}
namespace libCommsyLowlevel {
auto get_posts(GumboNode *node) {
std::vector<GumboNode *> posts;
gumbo_search_by_class(&posts, node, "uk-comment", GUMBO_TAG_ARTICLE);
return posts;
}
std::string get_post_name(GumboNode *node) {
std::vector<GumboNode *> titlenodes;
gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4);
if (titlenodes.empty())
throw libCommsy::scrapError();
return trim(gumbo_cleantext(titlenodes[0]));
}
std::string get_post_id(GumboNode *node) {
return gumbo_get_attr(node, "data-item-id", GUMBO_TAG_ARTICLE)[0];
}
std::string get_post_meta(GumboNode *node) {
std::vector<GumboNode *> metanodes;
gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
if (metanodes.size() < 2)
throw libCommsy::scrapError();
return clean_spaces(trim(gumbo_cleantext(metanodes[1])));
}
std::string get_post_url(GumboNode *node) {
std::vector<GumboNode *> urlnodes;
gumbo_search_by_class(&urlnodes, node, "uk-comment-title", GUMBO_TAG_H4);
if (urlnodes.empty())
throw libCommsy::scrapError();
return gumbo_get_attr(urlnodes[0], "href", GUMBO_TAG_A)[0];
}
bool get_post_unread(GumboNode *node) {
std::vector<GumboNode *> elems;
gumbo_search_by_class(&elems, node, "cs-comment-change-info", GUMBO_TAG_DIV);
return !elems.empty();
}
taskState::type get_post_taskState(GumboNode *node) {
// Find all elements that could contain the information we need and grab their "class" attribute
std::vector<std::string> divClassAttrs;
divClassAttrs = gumbo_get_attr(node, "class", GUMBO_TAG_I);
// Try to find the information we need
for (const auto& classAttr : divClassAttrs) {
if (classAttr.find("todo") != std::string::npos) {
return taskState::todo;
} else if (classAttr.find("inProgress") != std::string::npos) {
return taskState::inProgress;
} else if (classAttr.find("done") != std::string::npos) {
return taskState::done;
}
}
return taskState::none;
}
std::vector<std::map<std::string, std::string>> get_post_files(GumboNode *node) {
std::vector<GumboNode *> metanodes;
std::vector<std::string> fileurls;
std::vector<std::string> filenames;
std::vector<std::map<std::string, std::string>> filenameurlmap;
std::map<std::string, std::string> tmpmap;
// Get meta nodes
gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
// Get URLs
fileurls = gumbo_get_attr(metanodes[2], "href", GUMBO_TAG_A);
// Get filenames
filenames = gumbo_get_attr(metanodes[2], "title", GUMBO_TAG_A);
// Generate map
auto urlit = fileurls.begin();
auto nameit = filenames.begin();
while (true) {
// Break if last item was reached
if (urlit == fileurls.end() or nameit == filenames.end()) {
break;
}
// Generate temporary map
tmpmap = {};
tmpmap[*nameit] = *urlit;
// Append it to the result vector map
filenameurlmap.push_back(tmpmap);
// Get next item in both vectors
urlit++; nameit++;
}
return filenameurlmap;
}
std::string get_post_desc(const std::string& post_url, const std::string& server_sid) {
std::string material_id;
std::stringstream httpcontent;
GumboOutput *post_document;
GumboNode *desc_node;
std::vector<GumboNode *> results;
// Get material ID
material_id = get_filename(post_url);
// Download post
long statuscode = curlreq(httpcontent, server_sid, post_url);
// Check statuscode
if (statuscode != 200) {
throw libCommsy::descDownloadError();
}
// Parse post
post_document = gumbo_parse(httpcontent.str().c_str());
// Get description element
desc_node = gumbo_search_by_id(post_document->root, "description" + material_id, GUMBO_TAG_DIV);
// Extract description
gumbo_search_by_tag(&results, desc_node, GUMBO_TAG_P);
// Cencenate occurencies
std::string result_string;
for (const auto& result : results) {
result_string.append(trim(gumbo_cleantext(result)) + "\n");
}
// Return first occurence
return result_string;
}
};
using namespace libCommsyLowlevel;
// Class functions
// Tells whether `postID` is a valid index into the loaded posts, i.e. whether
// it is smaller than the number of posts counted in `numposts`.
bool libCommsy::postExists(unsigned long postID) {
    const bool within_range = numposts > postID;
    return within_range;
}
// Returns a pointer to the post stored at index `postID` in `posts`.
// Throws invalidPostError if postExists(postID) is false.
commsyPost *libCommsy::getPost(unsigned long postID) {
    if (postExists(postID)) {
        return &posts[postID];
    }
    // Unknown index
    throw invalidPostError();
}
std::string *libCommsy::getDescription(unsigned long postID) {
// Get post
commsyPost *thispost = getPost(postID);
// Check if post description was downloaded already
if (thispost->description == "\xFF") {
// Download post
thispost->description = get_post_desc(server_url + thispost->url, server_sid);
}
// Return it
return &thispost->description;
}
// Loads the posts of `room` from the server at `_server_url`, authenticating
// with the session ID `_server_sid`.
// The feed is requested page by page, each request passing the ID of the last
// post loaded so far (initially `start_id`) as "lastId". Loading stops when a
// page contains no posts or, if `max_posts` is not 0, as soon as `max_posts`
// posts have been loaded.
// Throws:
//   connectionFailError - the request raised an exception or returned an
//                         unhandled status code (which is also printed),
//   invalidSIDError     - status 302,
//   invalidRoomError    - status 500,
//   scrapError          - raised by the get_post_* helpers when the expected
//                         markup is missing.
// Every parsed page is released again once its posts have been copied,
// also when leaving through return or an exception.
libCommsy::libCommsy(const std::string& _server_url, const std::string& _server_sid, const std::string& room, const std::string start_id, const unsigned long max_posts) {
    // Frees a Gumbo parse tree when the owning unique_ptr goes out of scope.
    struct GumboOutputDeleter {
        void operator()(GumboOutput *output) const {
            gumbo_destroy_output(&kGumboDefaultOptions, output);
        }
    };
    // Define required variables
    server_url = _server_url;
    server_sid = _server_sid;
    lastID = start_id;
    numposts = 0;
    std::stringstream httpcontent;
    // Loop until all or max_posts posts are fetched
    while (true) {
        // Check connection and download feed
        long statuscode;
        try {
            statuscode = curlreq(httpcontent, server_sid, server_url + "/room/" + room + "/feed/0/date?lastId=" + lastID);
        } catch (const std::exception&) {
            throw connectionFailError();
        }
        if (statuscode == 302) {
            throw invalidSIDError();
        } else if (statuscode == 500) {
            throw invalidRoomError();
        } else if (statuscode != 200) {
            std::cout << "Unhandled status code " << statuscode << std::endl;
            throw connectionFailError();
        }
        // Parse the page; the buffer is kept in a named variable so that it
        // outlives the parse tree built from it.
        const std::string html = httpcontent.str();
        const std::unique_ptr<GumboOutput, GumboOutputDeleter> document(gumbo_parse(html.c_str()));
        // Get posts
        const auto gumboPosts = get_posts(document->root);
        if (gumboPosts.empty()) {
            // Stop fetching more data
            return;
        }
        // Copy everything needed out of the tree into commsyPost structs
        for (GumboNode *postnode : gumboPosts) {
            commsyPost thispost;
            thispost.name = get_post_name(postnode);
            thispost.id = get_post_id(postnode);
            thispost.meta = get_post_meta(postnode);
            thispost.unread = get_post_unread(postnode);
            thispost.taskState = get_post_taskState(postnode);
            thispost.url = get_post_url(postnode);
            // Flatten the list of {name -> url} maps into commsyFile entries
            const auto files = get_post_files(postnode);
            for (const auto& filemap : files) {
                for (const auto& [filename, fileurl] : filemap) {
                    commsyFile thisfile;
                    thisfile.name = filename;
                    thisfile.url = fileurl;
                    thispost.files.push_back(thisfile);
                }
            }
            // Append to posts vector
            posts.push_back(thispost);
            // Increment post counter
            numposts++;
            // Remember where the next page has to continue
            lastID = posts.back().id;
            // Check if maximum amount of posts to load was reached
            if (max_posts != 0 and numposts >= max_posts) {
                // Stop loading more posts
                return;
            }
        }
        // Empty the buffer before the next page is downloaded into it
        httpcontent.str(std::string());
    }
}