#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using json = nlohmann::json; #include class invalidSIDError : public std::exception {}; class invalidRoomError : public std::exception {}; class invalidPostError : public std::exception {}; class connectionFailError : public std::exception {}; class parsingNoSuchIDError : public std::exception {}; class parsingNoSuchTagError : public std::exception {}; class descDownloadError : public std::exception {}; static std::string server_url; static std::string server_sid; static std::string room; std::string ltrim(const std::string& s) { static const std::regex lws{"^[[:space:]]*", std::regex_constants::extended}; return std::regex_replace(s, lws, ""); } std::string rtrim(const std::string& s) { static const std::regex tws{"[[:space:]]*$", std::regex_constants::extended}; return std::regex_replace(s, tws, ""); } std::string trim(const std::string& s) { return ltrim(rtrim(s)); } std::string get_filename(const std::string& path) { return path.substr(path.find_last_of("/\\") + 1); } std::string clean_spaces(const std::string& s) { static const std::regex tws{"[ ]{2,}", std::regex_constants::extended}; std::string newstr = std::regex_replace(s, tws, ""); std::replace(newstr.begin(), newstr.end(), '\n', ' '); newstr.erase(0, 4); return newstr; } std::vector merge_strvects(std::vector base, const std::vector &addition) { base.insert(base.end(), addition.begin(), addition.end()); return base; } static long curlreq(std::stringstream &responsebuffer, std::string SID, std::string URL) { // Initialise variables curlpp::Cleanup cleaner; curlpp::Easy request; // Set the writer callback to enable cURL to write result in a memory area request.setOpt(new curlpp::options::WriteStream(&responsebuffer)); // Setting the URL to retrive. request.setOpt(new curlpp::options::Url(URL)); // Set SID cookie std::list header; header.push_back("Cookie: SID=" + SID); request.setOpt(new curlpp::options::HttpHeader(header)); // Perform request request.perform(); // Return result return curlpp::infos::ResponseCode::get(request); } void gumbo_search_by_attr(std::vector &elemvect, GumboNode* node, std::string attrname, std::string searchword, GumboTag expectedtag) { if (node->type != GUMBO_NODE_ELEMENT) { return; } GumboAttribute* hclass; if (node->v.element.tag == expectedtag && (hclass = gumbo_get_attribute(&node->v.element.attributes, attrname.c_str()))) { if (hclass->value == searchword) { elemvect.push_back(node); } } GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { gumbo_search_by_attr(elemvect, static_cast(children->data[i]), attrname, searchword, expectedtag); } } void gumbo_search_by_class(std::vector &elemvect, GumboNode* node, std::string searchword, GumboTag expectedtag) { return gumbo_search_by_attr(elemvect, node, "class", searchword, expectedtag); } GumboNode *gumbo_search_by_id(GumboNode* node, std::string searchword, GumboTag expectedtag) { std::vector elemvect; gumbo_search_by_attr(elemvect, node, "id", searchword, expectedtag); // Use first node found if (elemvect.size() > 0) { return elemvect[0]; } // If no nodes were found, panic() throw parsingNoSuchIDError(); return new GumboNode; } void gumbo_search_by_tag(std::vector &elemvect, GumboNode* node, GumboTag searchedtag) { if (node->type != GUMBO_NODE_ELEMENT) { return; } if (node->v.element.tag == searchedtag) { elemvect.push_back(node); } GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { gumbo_search_by_tag(elemvect, static_cast(children->data[i]), searchedtag); } } static std::string gumbo_cleantext(GumboNode* node) { if (node->type == GUMBO_NODE_TEXT) { return std::string(node->v.text.text); } else if (node->type == GUMBO_NODE_ELEMENT && node->v.element.tag != GUMBO_TAG_SCRIPT && node->v.element.tag != GUMBO_TAG_STYLE) { std::string contents = ""; GumboVector* children = &node->v.element.children; for (unsigned int i = 0; i < children->length; ++i) { const std::string text = gumbo_cleantext(reinterpret_cast (children->data[i])); if (i != 0 && !text.empty()) { contents.append(" "); } contents.append(text); } return contents; } else { return ""; } } std::vector gumbo_get_attr(GumboNode *node, std::string attrkey, GumboTag expected_tag) { std::vector attrvals; GumboNode *childnode; GumboVector* children = &node->v.element.children; std::vector toappend; // Check if current element is already the right one if (node->v.element.tag == expected_tag) { // Return this elements wanted attribute key return {gumbo_get_attribute(&node->v.element.attributes, attrkey.c_str())->value}; } // Check if This is a node element else if (node->type != GUMBO_NODE_ELEMENT) { return {}; } // Iterate through child nodes for (unsigned int it = 0; it < children->length; ++it) { childnode = reinterpret_cast (children->data[it]); if (childnode->v.element.tag == expected_tag) { // If node is the expected tag; use it attrvals.push_back(gumbo_get_attribute(&childnode->v.element.attributes, attrkey.c_str())->value); } else if (childnode->type == GUMBO_NODE_ELEMENT) { // Else; iterate through its child nodes toappend = gumbo_get_attr(childnode, attrkey, expected_tag); attrvals = merge_strvects(attrvals, toappend); } } // Return the final result return attrvals; } std::string gumbo_find_text_by_tag(GumboNode *node, GumboTag searchtag) { GumboNode *childnode; GumboVector* children = &node->v.element.children; // Iterate through childs for (unsigned int it = 0; it < children->length; ++it) { childnode = reinterpret_cast (children->data[it]); if (childnode->v.element.tag == searchtag) { // If node is the expected tag; check content return trim(gumbo_cleantext(childnode)); } } throw parsingNoSuchTagError(); return ""; } auto get_posts(GumboNode *node) { std::vector posts; gumbo_search_by_class(posts, node, "uk-comment", GUMBO_TAG_ARTICLE); return posts; } std::string get_post_name(GumboNode *node) { std::vector titlenodes; gumbo_search_by_class(titlenodes, node, "uk-comment-title", GUMBO_TAG_H4); return trim(gumbo_cleantext(titlenodes[0])); } std::string get_post_id(GumboNode *node) { return gumbo_get_attr(node, "data-item-id", GUMBO_TAG_ARTICLE)[0]; } std::string get_post_meta(GumboNode *node) { std::vector metanodes; gumbo_search_by_class(metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV); return clean_spaces(trim(gumbo_cleantext(metanodes[1]))); } std::string get_post_url(GumboNode *node) { std::vector titlenodes; gumbo_search_by_class(titlenodes, node, "uk-comment-title", GUMBO_TAG_H4); return gumbo_get_attr(titlenodes[0], "href", GUMBO_TAG_A)[0]; } std::vector> get_post_files(GumboNode *node) { std::vector metanodes; std::vector fileurls; std::vector filenames; std::vector> filenameurlmap; std::map tmpmap; // Get meta nodes gumbo_search_by_class(metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV); // Get URLs fileurls = gumbo_get_attr(metanodes[2], "href", GUMBO_TAG_A); // Get filenames filenames = gumbo_get_attr(metanodes[2], "title", GUMBO_TAG_A); // Generate map auto urlit = fileurls.begin(); auto nameit = filenames.begin(); while (true) { // Break if last item was reached if (urlit == fileurls.end() or nameit == filenames.end()) { break; } // Generate temporary map tmpmap = {}; tmpmap[*nameit] = *urlit; // Append it to the result vector map filenameurlmap.push_back(tmpmap); // Get next item in both vectors urlit++; nameit++; } return filenameurlmap; } std::string get_post_desc(std::string post_url) { std::string material_id; std::stringstream httpcontent; GumboOutput *post_document; GumboNode *desc_node; std::vector results; // Get material ID material_id = get_filename(post_url); // Download post long statuscode = curlreq(httpcontent, server_sid, post_url); // Check statuscode if (statuscode != 200) { throw descDownloadError(); } // Parse post post_document = gumbo_parse(httpcontent.str().c_str()); // Get description element desc_node = gumbo_search_by_id(post_document->root, "description" + material_id, GUMBO_TAG_DIV); // Extract description gumbo_search_by_tag(results, desc_node, GUMBO_TAG_P); // Cencenate occurencies std::string result_string; for (auto it = results.begin(); it != results.end(); it++) { result_string.append(trim(gumbo_cleantext(*it)) + "\n"); } // Return first occurence return result_string; } struct commsyPost { std::string name; std::string id; std::string description; std::string meta; std::string url; std::map files; }; #define libCommsy_NAME "libcommsy" #define libCommsy_VERSION "1.0" class libCommsy { public: std::vector posts; unsigned long numposts; bool postExists(unsigned long postID) { return postID < numposts; } commsyPost *getPost(unsigned long postID) { // Check if post exists if (not postExists(postID)) { throw invalidPostError(); } // Return post pointer return &posts[postID]; } std::string *getDescription(unsigned long postID) { // Get post commsyPost *thispost = getPost(postID); // Check if post description was downloaded already if (thispost->description.empty()) { // Download post thispost->description = get_post_desc(server_url + thispost->url); } // Return it return &thispost->description; } libCommsy(std::string _server_url, std::string _server_sid, std::string _room) { // Define required variables server_url = _server_url; server_sid = _server_sid; room = _room; std::string lastID; std::stringstream httpcontent; GumboOutput *document; long statuscode; numposts = 0; while (1) { // Check connection and download feed try { statuscode = curlreq(httpcontent, server_sid, server_url + "/room/" + room + "/feed/10/date?lastId=" + lastID); } catch (std::exception&) { throw connectionFailError(); } if (statuscode == 302) { throw invalidSIDError(); } else if (statuscode == 500) { throw invalidRoomError(); } else if (statuscode != 200) { throw connectionFailError(); } // Do some stuff document = gumbo_parse(httpcontent.str().c_str()); httpcontent.str(std::string()); // Clear buffer just in case we need it later // Get posts auto gumboPosts = get_posts(document->root); if (gumboPosts.size() == 0) { // Stop fetching more data break; } // Map posts and their corresponding URL to a number for (auto it = gumboPosts.begin(); it != gumboPosts.end(); it++) { // Create post struct commsyPost thispost; { // Get posts name thispost.name = get_post_name(*it); // Get posts ID thispost.id = get_post_id(*it); // Get posts meta string thispost.meta = get_post_meta(*it); // Get posts URL thispost.url = get_post_url(*it); // Get posts files auto files = get_post_files(*it); for (const auto& filemap : files) { thispost.files.insert(filemap.begin(), filemap.end()); } } // Append to posts vector posts.push_back(thispost); // Increment post counter numposts++; // Get lastID lastID = posts.back().id; } } } };