1
0
Fork 0
mirror of https://gitlab.com/niansa/qcommsy.git synced 2025-03-06 20:53:33 +01:00

Improved scraper safety

This commit is contained in:
niansa 2020-11-11 15:35:51 +01:00
parent c3d595f68b
commit 6f2c9286e2
2 changed files with 14 additions and 10 deletions

View file

@ -32,10 +32,6 @@
#include "libcommsy.hpp"
// Empty exception tag types local to this file; they carry no data and are
// distinguished only by their type.

// Thrown by gumbo_search_by_id when no matching nodes were found.
class parsingNoSuchIDError {};
// Thrown at the end of gumbo_find_text_by_tag.
// NOTE(review): presumably signals that no text was found for the tag — confirm against the full function.
class parsingNoSuchTagError {};
// Thrown by get_post_desc when the request's status code is not 200.
class descDownloadError {};
std::string ltrim(const std::string& s) {
@ -100,7 +96,7 @@ GumboNode *gumbo_search_by_id(GumboNode* node, const std::string& searchword, co
return elemvect[0];
}
// If no nodes were found, panic()
throw parsingNoSuchIDError();
throw libCommsy::scrapError();
}
void gumbo_search_by_tag(std::vector<GumboNode *> *elemvect, GumboNode* node, const GumboTag& searchedtag) {
@ -188,7 +184,7 @@ std::string gumbo_find_text_by_tag(GumboNode *node, const GumboTag& searchtag) {
}
}
throw parsingNoSuchTagError();
throw libCommsy::scrapError();
}
@ -201,6 +197,8 @@ auto get_posts(GumboNode *node) {
// Returns the trimmed, cleaned text of the first node that
// gumbo_search_by_class collects for class "uk-comment-title" / tag H4
// below `node`.
// Throws libCommsy::scrapError when the search collects no nodes.
std::string get_post_name(GumboNode *node) {
    std::vector<GumboNode *> headings;
    gumbo_search_by_class(&headings, node, "uk-comment-title", GUMBO_TAG_H4);

    // Happy path first: at least one heading was collected.
    if (!headings.empty()) {
        GumboNode *first_heading = headings.front();
        return trim(gumbo_cleantext(first_heading));
    }

    // Nothing to read the title from.
    throw libCommsy::scrapError();
}
@ -211,13 +209,17 @@ std::string get_post_id(GumboNode *node) {
// Returns the cleaned text of the SECOND node that gumbo_search_by_class
// collects for class "uk-comment-meta" / tag DIV below `node`, trimmed and
// then passed through clean_spaces.
// Throws libCommsy::scrapError when fewer than two such nodes are collected.
std::string get_post_meta(GumboNode *node) {
    std::vector<GumboNode *> meta_divs;
    gumbo_search_by_class(&meta_divs, node, "uk-comment-meta", GUMBO_TAG_DIV);

    // Index 1 is read below, so two entries are the minimum.
    if (meta_divs.size() >= 2) {
        GumboNode *second_meta = meta_divs[1];
        const std::string raw_text = gumbo_cleantext(second_meta);
        return clean_spaces(trim(raw_text));
    }

    throw libCommsy::scrapError();
}
// Returns the first "href" value that gumbo_get_attr yields (tag A) for the
// first node gumbo_search_by_class collects for class "uk-comment-title" /
// tag H4 below `node`.
// Throws libCommsy::scrapError when the search collects no nodes.
std::string get_post_url(GumboNode *node) {
    std::vector<GumboNode *> urlnodes;
    gumbo_search_by_class(&urlnodes, node, "uk-comment-title", GUMBO_TAG_H4);
    // Guard BEFORE indexing: reading element 0 of an empty vector is undefined
    // behavior. The previous body returned from an unguarded lookup first,
    // which left this check unreachable.
    if (urlnodes.empty())
        throw libCommsy::scrapError();
    // NOTE(review): the result of gumbo_get_attr is indexed with [0] without a
    // check; confirm it cannot be empty when the heading has no <a href>.
    return gumbo_get_attr(urlnodes[0], "href", GUMBO_TAG_A)[0];
}
bool get_post_unread(GumboNode *node) {
@ -293,7 +295,7 @@ std::string get_post_desc(const std::string& post_url, const std::string& server
long statuscode = curlreq(httpcontent, server_sid, post_url);
// Check statuscode
if (statuscode != 200) {
throw descDownloadError();
throw libCommsy::descDownloadError();
}
// Parse post

View file

@ -65,4 +65,6 @@ public:
// Empty exception tag types; they carry no data and are distinguished only
// by their type.
// NOTE(review): the throw sites of the first three are not visible here —
// presumably invalid room, invalid post and connection failure; confirm.
class invalidRoomError {};
class invalidPostError {};
class connectionFailError {};
// Thrown by get_post_desc when the request's status code is not 200.
class descDownloadError {};
// Thrown by the scraping helpers (e.g. get_post_name, get_post_meta,
// get_post_url) when the nodes they expect are not found.
class scrapError {};
};