1
0
Fork 0
mirror of https://gitlab.com/niansa/qcommsy.git synced 2025-03-06 20:53:33 +01:00

Improved scraper safety

This commit is contained in:
niansa 2020-11-11 15:35:51 +01:00
parent c3d595f68b
commit 6f2c9286e2
2 changed files with 14 additions and 10 deletions

View file

@ -32,10 +32,6 @@
#include "libcommsy.hpp" #include "libcommsy.hpp"
class parsingNoSuchIDError {};
class parsingNoSuchTagError {};
class descDownloadError {};
std::string ltrim(const std::string& s) { std::string ltrim(const std::string& s) {
@ -100,7 +96,7 @@ GumboNode *gumbo_search_by_id(GumboNode* node, const std::string& searchword, co
return elemvect[0]; return elemvect[0];
} }
// If no nodes were found, throw an exception // If no nodes were found, throw an exception
throw parsingNoSuchIDError(); throw libCommsy::scrapError();
} }
void gumbo_search_by_tag(std::vector<GumboNode *> *elemvect, GumboNode* node, const GumboTag& searchedtag) { void gumbo_search_by_tag(std::vector<GumboNode *> *elemvect, GumboNode* node, const GumboTag& searchedtag) {
@ -188,7 +184,7 @@ std::string gumbo_find_text_by_tag(GumboNode *node, const GumboTag& searchtag) {
} }
} }
throw parsingNoSuchTagError(); throw libCommsy::scrapError();
} }
@ -201,6 +197,8 @@ auto get_posts(GumboNode *node) {
std::string get_post_name(GumboNode *node) { std::string get_post_name(GumboNode *node) {
std::vector<GumboNode *> titlenodes; std::vector<GumboNode *> titlenodes;
gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4); gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4);
if (titlenodes.empty())
throw libCommsy::scrapError();
return trim(gumbo_cleantext(titlenodes[0])); return trim(gumbo_cleantext(titlenodes[0]));
} }
@ -211,13 +209,17 @@ std::string get_post_id(GumboNode *node) {
std::string get_post_meta(GumboNode *node) { std::string get_post_meta(GumboNode *node) {
std::vector<GumboNode *> metanodes; std::vector<GumboNode *> metanodes;
gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV); gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
if (metanodes.size() < 2)
throw libCommsy::scrapError();
return clean_spaces(trim(gumbo_cleantext(metanodes[1]))); return clean_spaces(trim(gumbo_cleantext(metanodes[1])));
} }
std::string get_post_url(GumboNode *node) { std::string get_post_url(GumboNode *node) {
std::vector<GumboNode *> titlenodes; std::vector<GumboNode *> urlnodes;
gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4); gumbo_search_by_class(&urlnodes, node, "uk-comment-title", GUMBO_TAG_H4);
return gumbo_get_attr(titlenodes[0], "href", GUMBO_TAG_A)[0]; if (urlnodes.empty())
throw libCommsy::scrapError();
return gumbo_get_attr(urlnodes[0], "href", GUMBO_TAG_A)[0];
} }
bool get_post_unread(GumboNode *node) { bool get_post_unread(GumboNode *node) {
@ -293,7 +295,7 @@ std::string get_post_desc(const std::string& post_url, const std::string& server
long statuscode = curlreq(httpcontent, server_sid, post_url); long statuscode = curlreq(httpcontent, server_sid, post_url);
// Check statuscode // Check statuscode
if (statuscode != 200) { if (statuscode != 200) {
throw descDownloadError(); throw libCommsy::descDownloadError();
} }
// Parse post // Parse post

View file

@ -65,4 +65,6 @@ public:
class invalidRoomError {}; class invalidRoomError {};
class invalidPostError {}; class invalidPostError {};
class connectionFailError {}; class connectionFailError {};
class descDownloadError {};
class scrapError {};
}; };