mirror of
https://gitlab.com/niansa/qcommsy.git
synced 2025-03-06 20:53:33 +01:00
Improved scraper safety
This commit is contained in:
parent
c3d595f68b
commit
6f2c9286e2
2 changed files with 14 additions and 10 deletions
|
@ -32,10 +32,6 @@
|
||||||
|
|
||||||
#include "libcommsy.hpp"
|
#include "libcommsy.hpp"
|
||||||
|
|
||||||
class parsingNoSuchIDError {};
|
|
||||||
class parsingNoSuchTagError {};
|
|
||||||
class descDownloadError {};
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
std::string ltrim(const std::string& s) {
|
std::string ltrim(const std::string& s) {
|
||||||
|
@ -100,7 +96,7 @@ GumboNode *gumbo_search_by_id(GumboNode* node, const std::string& searchword, co
|
||||||
return elemvect[0];
|
return elemvect[0];
|
||||||
}
|
}
|
||||||
// If no nodes were found, panic()
|
// If no nodes were found, panic()
|
||||||
throw parsingNoSuchIDError();
|
throw libCommsy::scrapError();
|
||||||
}
|
}
|
||||||
|
|
||||||
void gumbo_search_by_tag(std::vector<GumboNode *> *elemvect, GumboNode* node, const GumboTag& searchedtag) {
|
void gumbo_search_by_tag(std::vector<GumboNode *> *elemvect, GumboNode* node, const GumboTag& searchedtag) {
|
||||||
|
@ -188,7 +184,7 @@ std::string gumbo_find_text_by_tag(GumboNode *node, const GumboTag& searchtag) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
throw parsingNoSuchTagError();
|
throw libCommsy::scrapError();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -201,6 +197,8 @@ auto get_posts(GumboNode *node) {
|
||||||
std::string get_post_name(GumboNode *node) {
|
std::string get_post_name(GumboNode *node) {
|
||||||
std::vector<GumboNode *> titlenodes;
|
std::vector<GumboNode *> titlenodes;
|
||||||
gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4);
|
gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4);
|
||||||
|
if (titlenodes.empty())
|
||||||
|
throw libCommsy::scrapError();
|
||||||
return trim(gumbo_cleantext(titlenodes[0]));
|
return trim(gumbo_cleantext(titlenodes[0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -211,13 +209,17 @@ std::string get_post_id(GumboNode *node) {
|
||||||
std::string get_post_meta(GumboNode *node) {
|
std::string get_post_meta(GumboNode *node) {
|
||||||
std::vector<GumboNode *> metanodes;
|
std::vector<GumboNode *> metanodes;
|
||||||
gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
|
gumbo_search_by_class(&metanodes, node, "uk-comment-meta", GUMBO_TAG_DIV);
|
||||||
|
if (metanodes.size() < 2)
|
||||||
|
throw libCommsy::scrapError();
|
||||||
return clean_spaces(trim(gumbo_cleantext(metanodes[1])));
|
return clean_spaces(trim(gumbo_cleantext(metanodes[1])));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string get_post_url(GumboNode *node) {
|
std::string get_post_url(GumboNode *node) {
|
||||||
std::vector<GumboNode *> titlenodes;
|
std::vector<GumboNode *> urlnodes;
|
||||||
gumbo_search_by_class(&titlenodes, node, "uk-comment-title", GUMBO_TAG_H4);
|
gumbo_search_by_class(&urlnodes, node, "uk-comment-title", GUMBO_TAG_H4);
|
||||||
return gumbo_get_attr(titlenodes[0], "href", GUMBO_TAG_A)[0];
|
if (urlnodes.empty())
|
||||||
|
throw libCommsy::scrapError();
|
||||||
|
return gumbo_get_attr(urlnodes[0], "href", GUMBO_TAG_A)[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
bool get_post_unread(GumboNode *node) {
|
bool get_post_unread(GumboNode *node) {
|
||||||
|
@ -293,7 +295,7 @@ std::string get_post_desc(const std::string& post_url, const std::string& server
|
||||||
long statuscode = curlreq(httpcontent, server_sid, post_url);
|
long statuscode = curlreq(httpcontent, server_sid, post_url);
|
||||||
// Check statuscode
|
// Check statuscode
|
||||||
if (statuscode != 200) {
|
if (statuscode != 200) {
|
||||||
throw descDownloadError();
|
throw libCommsy::descDownloadError();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse post
|
// Parse post
|
||||||
|
|
|
@ -65,4 +65,6 @@ public:
|
||||||
class invalidRoomError {};
|
class invalidRoomError {};
|
||||||
class invalidPostError {};
|
class invalidPostError {};
|
||||||
class connectionFailError {};
|
class connectionFailError {};
|
||||||
|
class descDownloadError {};
|
||||||
|
class scrapError {};
|
||||||
};
|
};
|
||||||
|
|
Loading…
Add table
Reference in a new issue