/* QCommsy Copyright (C) 2020 niansa This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include "libcommsy.hpp" std::string ltrim(const std::string& s) { static const std::regex lws{"^[[:space:]]*", std::regex_constants::extended}; return std::regex_replace(s, lws, ""); } std::string rtrim(const std::string& s) { static const std::regex tws{"[[:space:]]*$", std::regex_constants::extended}; return std::regex_replace(s, tws, ""); } std::string trim(const std::string& s) { return ltrim(rtrim(s)); } std::string get_filename(const std::string& path) { return path.substr(path.find_last_of("/\\") + 1); } std::string clean_spaces(const std::string& s) { static const std::regex tws{"[ ]{2,}", std::regex_constants::extended}; std::string newstr = std::regex_replace(s, tws, ""); std::replace(newstr.begin(), newstr.end(), '\n', ' '); newstr.erase(0, 4); return newstr; } std::vector merge_strvects(std::vector base, const std::vector &addition) { base.insert(base.end(), addition.begin(), addition.end()); return base; } extern long curlreq(std::stringstream &responsebuffer, std::string SID, std::string URL); void gumbo_search_by_attr(std::vector *elemvect, GumboNode* node, const std::string& attrname, const std::string& searchword, const GumboTag& expectedtag) { if (node->type != GUMBO_NODE_ELEMENT) { return; } GumboAttribute* hclass; if (node->v.element.tag == expectedtag && (hclass = gumbo_get_attribute(&node->v.element.attributes, attrname.c_str()))) { if (hclass->value == searchword) { elemvect->push_back(node); } } GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { gumbo_search_by_attr(elemvect, static_cast(children->data[i]), attrname, searchword, expectedtag); } } inline void gumbo_search_by_class(std::vector *elemvect, GumboNode* node, const std::string& searchword, const GumboTag& expectedtag) { return gumbo_search_by_attr(elemvect, node, "class", searchword, expectedtag); } GumboNode *gumbo_search_by_id(GumboNode* node, const std::string& searchword, const GumboTag& expectedtag) { std::vector elemvect; gumbo_search_by_attr(&elemvect, node, "id", searchword, expectedtag); // Use first node found if (elemvect.size() > 0) { return elemvect[0]; } // If no nodes were found, panic() throw libCommsy::scrapError(); } void gumbo_search_by_tag(std::vector *elemvect, GumboNode* node, const GumboTag& searchedtag) { if (node->type != GUMBO_NODE_ELEMENT) { return; } if (node->v.element.tag == searchedtag) { elemvect->push_back(node); } GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { gumbo_search_by_tag(elemvect, static_cast(children->data[i]), searchedtag); } } std::string gumbo_cleantext(GumboNode* node) { if (node->type == GUMBO_NODE_TEXT) { return std::string(node->v.text.text); } else if (node->type == GUMBO_NODE_ELEMENT && node->v.element.tag != GUMBO_TAG_SCRIPT && node->v.element.tag != GUMBO_TAG_STYLE) { std::string contents = ""; GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { const std::string text = gumbo_cleantext(reinterpret_cast (children->data[i])); if (i != 0 && !text.empty()) { contents.append(" "); } contents.append(text); } return contents; } else { return ""; } } std::vector gumbo_get_attr(GumboNode *node, const std::string& attrkey, const GumboTag& expected_tag) { std::vector attrvals; GumboNode *childnode; GumboVector* children = &node->v.element.children; std::vector toappend; // Check if current element is already the right one if (node->v.element.tag == expected_tag) { // Return this elements wanted attribute key return {gumbo_get_attribute(&node->v.element.attributes, attrkey.c_str())->value}; } // Check if This is a node element else if (node->type != GUMBO_NODE_ELEMENT) { return {}; } // Iterate through child nodes for (unsigned int it = 0; it < children->length; ++it) { childnode = reinterpret_cast (children->data[it]); if (childnode->v.element.tag == expected_tag) { // If node is the expected tag; use it attrvals.push_back(gumbo_get_attribute(&childnode->v.element.attributes, attrkey.c_str())->value); } else if (childnode->type == GUMBO_NODE_ELEMENT) { // Else; iterate through its child nodes toappend = gumbo_get_attr(childnode, attrkey, expected_tag); attrvals = merge_strvects(attrvals, toappend); } } // Return the final result return attrvals; } std::string gumbo_find_text_by_tag(GumboNode *node, const GumboTag& searchtag) { GumboNode *childnode; GumboVector* children = &node->v.element.children; // Iterate through childs for (unsigned int it = 0; it < children->length; ++it) { childnode = reinterpret_cast (children->data[it]); if (childnode->v.element.tag == searchtag) { // If node is the expected tag; check content return trim(gumbo_cleantext(childnode)); } } throw libCommsy::scrapError(); } namespace libCommsyLowlevel { auto get_posts(GumboNode *node) { std::vector posts; gumbo_search_by_class(&posts, node, "uk-comment", GUMBO_TAG_ARTICLE); return posts; } std::string get_post_name(GumboNode *node) { std::vector titlenodes; gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4); if (titlenodes.empty()) throw libCommsy::scrapError(); return trim(gumbo_cleantext(titlenodes[0])); } std::string get_post_id(GumboNode *node) { return gumbo_get_attr(node, "data-item-id", GUMBO_TAG_ARTICLE)[0]; } std::string get_post_meta(GumboNode *node) { std::vector metanodes; gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV); if (metanodes.size() < 2) throw libCommsy::scrapError(); return clean_spaces(trim(gumbo_cleantext(metanodes[1]))); } std::string get_post_url(GumboNode *node) { std::vector urlnodes; gumbo_search_by_class(&urlnodes, node, "uk-comment-title", GUMBO_TAG_H4); if (urlnodes.empty()) throw libCommsy::scrapError(); return gumbo_get_attr(urlnodes[0], "href", GUMBO_TAG_A)[0]; } bool get_post_unread(GumboNode *node) { std::vector elems; gumbo_search_by_class(&elems, node, "cs-comment-change-info", GUMBO_TAG_DIV); return !elems.empty(); } taskState::type get_post_taskState(GumboNode *node) { // Find all elements that could contain the information we need and grab their "class" attribute std::vector divClassAttrs; divClassAttrs = gumbo_get_attr(node, "class", GUMBO_TAG_I); // Try to find the information we need for (const auto& classAttr : divClassAttrs) { if (classAttr.find("todo") != std::string::npos) { return taskState::todo; } else if (classAttr.find("inProgress") != std::string::npos) { return taskState::inProgress; } else if (classAttr.find("done") != std::string::npos) { return taskState::done; } } return taskState::none; } std::vector> get_post_files(GumboNode *node) { std::vector metanodes; std::vector fileurls; std::vector filenames; std::vector> filenameurlmap; std::map tmpmap; // Get meta nodes gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV); // Get URLs fileurls = gumbo_get_attr(metanodes[2], "href", GUMBO_TAG_A); // Get filenames filenames = gumbo_get_attr(metanodes[2], "title", GUMBO_TAG_A); // Generate map auto urlit = fileurls.begin(); auto nameit = filenames.begin(); while (true) { // Break if last item was reached if (urlit == fileurls.end() or nameit == filenames.end()) { break; } // Generate temporary map tmpmap = {}; tmpmap[*nameit] = *urlit; // Append it to the result vector map filenameurlmap.push_back(tmpmap); // Get next item in both vectors urlit++; nameit++; } return filenameurlmap; } std::string get_post_desc(const std::string& post_url, const std::string& server_sid) { std::string material_id; std::stringstream httpcontent; GumboOutput *post_document; GumboNode *desc_node; std::vector results; // Get material ID material_id = get_filename(post_url); // Download post long statuscode = curlreq(httpcontent, server_sid, post_url); // Check statuscode if (statuscode != 200) { throw libCommsy::descDownloadError(); } // Parse post post_document = gumbo_parse(httpcontent.str().c_str()); // Get description element desc_node = gumbo_search_by_id(post_document->root, "description" + material_id, GUMBO_TAG_DIV); // Extract description gumbo_search_by_tag(&results, desc_node, GUMBO_TAG_P); // Cencenate occurencies std::string result_string; for (const auto& result : results) { result_string.append(trim(gumbo_cleantext(result)) + "\n"); } // Return first occurence return result_string; } }; using namespace libCommsyLowlevel; // Class functions bool libCommsy::postExists(unsigned long postID) { return postID < numposts; } commsyPost *libCommsy::getPost(unsigned long postID) { // Check if post exists if (not postExists(postID)) { throw invalidPostError(); } // Return post pointer return &posts[postID]; } std::string *libCommsy::getDescription(unsigned long postID) { // Get post commsyPost *thispost = getPost(postID); // Check if post description was downloaded already if (thispost->description == "\xFF") { // Download post thispost->description = get_post_desc(server_url + thispost->url, server_sid); } // Return it return &thispost->description; } libCommsy::libCommsy(const std::string& _server_url, const std::string& _server_sid, const std::string& room, const std::string start_id, const unsigned long max_posts) { // Define required variables server_url = _server_url; server_sid = _server_sid; lastID = start_id; std::stringstream httpcontent; GumboOutput *document; long statuscode; numposts = 0; // Loop until all or max_posts posts are fetched while (1) { // Check connection and download feed try { statuscode = curlreq(httpcontent, server_sid, server_url + "/room/" + room + "/feed/0/date?lastId=" + lastID); } catch (std::exception&) { throw connectionFailError(); } if (statuscode == 302) { throw invalidSIDError(); } else if (statuscode == 500) { throw invalidRoomError(); } else if (statuscode != 200) { std::cout << "Unhandled status code " << statuscode << std::endl; throw connectionFailError(); } // Initialise document document = gumbo_parse(httpcontent.str().c_str()); // Get posts auto gumboPosts = get_posts(document->root); if (gumboPosts.size() == 0) { // Stop fetching more data return; } // Map posts and their corresponding URL to a number for (auto it = gumboPosts.begin(); it != gumboPosts.end(); it++) { // Create post struct commsyPost thispost; { // Get posts name thispost.name = get_post_name(*it); // Get posts ID thispost.id = get_post_id(*it); // Get posts meta string thispost.meta = get_post_meta(*it); // Get if post is unread thispost.unread = get_post_unread(*it); // Get posts task state thispost.taskState = get_post_taskState(*it); // Get posts URL thispost.url = get_post_url(*it); // Get posts files auto files = get_post_files(*it); for (const auto& filemap : files) { for (const auto& [filename, fileurl] : filemap) { commsyFile thisfile; { thisfile.name = filename; thisfile.url = fileurl; } thispost.files.push_back(thisfile); } } } // Append to posts vector posts.push_back(thispost); // Increment post counter numposts++; // Get lastID lastID = posts.back().id; // Check if maximum amount of posts to load was exceeded if (max_posts != 0 and numposts >= max_posts) { // Stop loading more posts return; } } // Clear buffer just in case we need it later httpcontent.str(std::string()); } }