mirror of
https://gitlab.com/niansa/qcommsy.git
synced 2025-03-06 20:53:33 +01:00
Improved scraper safety
This commit is contained in:
parent
c3d595f68b
commit
6f2c9286e2
2 changed files with 14 additions and 10 deletions
|
@ -32,10 +32,6 @@
|
|||
|
||||
#include "libcommsy.hpp"
|
||||
|
||||
// Empty tag type thrown by gumbo_search_by_id when no matching node was found.
class parsingNoSuchIDError {};
|
||||
// Empty tag type thrown by gumbo_find_text_by_tag when it reaches its end
// without having returned a result.
class parsingNoSuchTagError {};
|
||||
// Empty tag type thrown by get_post_desc when curlreq returns a status code
// other than 200.
class descDownloadError {};
|
||||
|
||||
|
||||
|
||||
std::string ltrim(const std::string& s) {
|
||||
|
@ -100,7 +96,7 @@ GumboNode *gumbo_search_by_id(GumboNode* node, const std::string& searchword, co
|
|||
return elemvect[0];
|
||||
}
|
||||
// If no nodes were found, panic()
|
||||
throw parsingNoSuchIDError();
|
||||
throw libCommsy::scrapError();
|
||||
}
|
||||
|
||||
void gumbo_search_by_tag(std::vector<GumboNode *> *elemvect, GumboNode* node, const GumboTag& searchedtag) {
|
||||
|
@ -188,7 +184,7 @@ std::string gumbo_find_text_by_tag(GumboNode *node, const GumboTag& searchtag) {
|
|||
}
|
||||
}
|
||||
|
||||
throw parsingNoSuchTagError();
|
||||
throw libCommsy::scrapError();
|
||||
}
|
||||
|
||||
|
||||
|
@ -201,6 +197,8 @@ auto get_posts(GumboNode *node) {
|
|||
// Returns the title text of a post.
//
// Asks gumbo_search_by_class for "uk-comment-title" / GUMBO_TAG_H4 matches
// starting at `node`, then passes the first match through gumbo_cleantext
// and trim.
//
// Throws libCommsy::scrapError when the search yields no match.
std::string get_post_name(GumboNode *node) {
    std::vector<GumboNode *> headings;
    gumbo_search_by_class(&headings, node, "uk-comment-title", GUMBO_TAG_H4);

    if (!headings.empty()) {
        return trim(gumbo_cleantext(headings.front()));
    }

    // No title heading was collected, so there is nothing to read a name from
    throw libCommsy::scrapError();
}
|
||||
|
||||
|
@ -211,13 +209,17 @@ std::string get_post_id(GumboNode *node) {
|
|||
// Returns the meta text of a post.
//
// Asks gumbo_search_by_class for "uk-comment-meta" / GUMBO_TAG_DIV matches
// starting at `node` and uses the SECOND match (index 1): its text goes
// through gumbo_cleantext, trim and finally clean_spaces.
//
// Throws libCommsy::scrapError when fewer than two matches were collected.
std::string get_post_meta(GumboNode *node) {
    std::vector<GumboNode *> metaDivs;
    gumbo_search_by_class(&metaDivs, node, "uk-comment-meta", GUMBO_TAG_DIV);

    // Index 1 is read below, so at least two entries are required
    const bool hasSecondEntry = metaDivs.size() >= 2;
    if (!hasSecondEntry) {
        throw libCommsy::scrapError();
    }

    GumboNode *metaNode = metaDivs[1];
    return clean_spaces(trim(gumbo_cleantext(metaNode)));
}
|
||||
|
||||
// Returns the URL of a post.
//
// Asks gumbo_search_by_class for "uk-comment-title" / GUMBO_TAG_H4 matches
// starting at `node`, then reads the "href" attribute of GUMBO_TAG_A via
// gumbo_get_attr on the first match and returns the first value found.
//
// Throws libCommsy::scrapError when no title heading was collected or when
// no "href" value was returned for it.
std::string get_post_url(GumboNode *node) {
    std::vector<GumboNode *> urlnodes;
    gumbo_search_by_class(&urlnodes, node, "uk-comment-title", GUMBO_TAG_H4);
    // Guard before indexing: an empty result must not be dereferenced
    if (urlnodes.empty())
        throw libCommsy::scrapError();

    // NOTE(review): gumbo_get_attr is indexed with [0] by the original code,
    // so it presumably returns a sequence of strings that may be empty when
    // the heading has no link — confirm against its definition.
    auto hrefs = gumbo_get_attr(urlnodes[0], "href", GUMBO_TAG_A);
    if (hrefs.empty())
        throw libCommsy::scrapError();
    return hrefs[0];
}
|
||||
|
||||
bool get_post_unread(GumboNode *node) {
|
||||
|
@ -293,7 +295,7 @@ std::string get_post_desc(const std::string& post_url, const std::string& server
|
|||
long statuscode = curlreq(httpcontent, server_sid, post_url);
|
||||
// Check statuscode
|
||||
if (statuscode != 200) {
|
||||
throw descDownloadError();
|
||||
throw libCommsy::descDownloadError();
|
||||
}
|
||||
|
||||
// Parse post
|
||||
|
|
|
@ -65,4 +65,6 @@ public:
|
|||
// Empty exception tag type.
// NOTE(review): no throw site is visible in this view; presumably signals an
// invalid room — confirm against the callers.
class invalidRoomError {};
|
||||
// Empty exception tag type.
// NOTE(review): no throw site is visible in this view; presumably signals an
// invalid post — confirm against the callers.
class invalidPostError {};
|
||||
// Empty exception tag type.
// NOTE(review): no throw site is visible in this view; presumably signals a
// failed connection — confirm against the callers.
class connectionFailError {};
|
||||
// Empty exception tag type thrown by get_post_desc when curlreq returns a
// status code other than 200.
class descDownloadError {};
|
||||
// Empty exception tag type thrown by the scraping helpers (e.g.
// get_post_name, get_post_meta, get_post_url) when an element they expect
// was not found in the parsed document.
class scrapError {};
|
||||
};
|
||||
|
|
Loading…
Add table
Reference in a new issue