/*
 * Convenience wrappers around sc_api() and sc_capix() that fill in the isfmt argument automatically.
 * 0##__VA_OPT__(1) pastes to 01 (nonzero) when variadic arguments follow the endpoint and to a plain 0 when there are none,
 * so the endpoint is only treated as a printf-style format string when arguments for it were actually passed.
 * c = cache, b = request body, h = extra headers, e = endpoint (URL or format string).
 */
#define SC_CAPI(c, b, h, e, ...) sc_api(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__)
#define SC_CAPIX(c, b, h, e, ...) sc_capix(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__)
/*
 * Performs an HTTP request and returns the response body as a heap-allocated, NUL-terminated string.
 *
 * c        - cache handle, used for logging; must not be NULL
 * body     - request body; when non-NULL the request is a POST, otherwise a GET
 * headers  - extra request headers appended to SC_HTTP_HEADERS; may be NULL
 * isfmt    - nonzero when endpoint is a printf-style format string whose arguments follow it
 * endpoint - URL (or format string for it); must not be NULL
 *
 * Returns the response body, which the caller must free(), or NULL when the arguments are invalid, memory runs out
 * or the connection can not be established. A non-2xx status or a read error is logged, but whatever part of the
 * body was received is still returned.
 */
char * sc_api (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) {
	if (!c || !endpoint)
		return NULL;
	/* everything the cleanup code touches is declared before the first goto */
	char * endpoint_formatted = NULL;
	char * hedrs = NULL;
	char * contentType = NULL;
	char * redir = NULL;
	char * buf = NULL;
	char * url = endpoint;
	size_t buf_sizeof = SC_HTTP_RBUFSIZE; /* capacity for payload bytes; allocations are one byte larger for the NUL */
	size_t buf_length = 0;
	long response_code = 0;
	int readstatus = 0;
	void * r = NULL;
	if (isfmt && parse_printf_format(endpoint, 0, NULL) > 0) {
		va_list ap, ap2;
		va_start(ap, endpoint);
		va_copy(ap2, ap);
		int needed = vsnprintf(NULL, 0, endpoint, ap); /* negative on error, so keep it signed */
		va_end(ap);
		if (needed >= 0)
			endpoint_formatted = malloc((size_t) needed+1);
		if (endpoint_formatted)
			vsnprintf(endpoint_formatted, (size_t) needed+1, endpoint, ap2);
		va_end(ap2);
		if (!endpoint_formatted) {
			SC_LOG(SC_LOG_ERROR, c, "could not format endpoint: %s", endpoint);
			goto rc;
		}
		url = endpoint_formatted;
	}
	if (!headers)
		headers = "";
	hedrs = malloc(strlen(SC_HTTP_HEADERS)+strlen(headers)+1);
	buf = malloc(buf_sizeof+1);
	if (!hedrs || !buf) {
		SC_LOG(SC_LOG_ERROR, c, "out of memory, endpoint: %s", url);
		free(buf);
		buf = NULL;
		goto rc;
	}
	strcpy(hedrs, SC_HTTP_HEADERS);
	strcat(hedrs, headers);
	r = xmlNanoHTTPMethodRedir(
		url,
		body ? "POST" : "GET",
		body,
		&contentType,
		&redir,
		hedrs,
		body ? strlen(body) : 0
	);
	if (!r) {
		SC_LOG(SC_LOG_ERROR, c, "!r, endpoint: %s", url);
		free(buf); /* nothing was read, so do not hand an uninitialized buffer to the caller */
		buf = NULL;
		goto rc;
	}
	response_code = xmlNanoHTTPReturnCode(r);
	if (response_code < 200 || response_code >= 300) {
		SC_LOG(SC_LOG_ERROR, c, "response_code == %ld, endpoint: %s", response_code, url);
	}
	while ((readstatus = xmlNanoHTTPRead(r, buf+buf_length, buf_sizeof-buf_length)) > 0) {
		buf_length += readstatus;
		if (buf_sizeof-buf_length < SC_HTTP_RBUFSIZE) {
			size_t grown = buf_sizeof*SC_REALLOC_K;
			char * bigger = realloc(buf, grown+1);
			if (!bigger) { /* buf is still valid; stop reading and return what we have */
				SC_LOG(SC_LOG_ERROR, c, "out of memory while reading, endpoint: %s", url);
				break;
			}
			buf = bigger;
			buf_sizeof = grown;
		}
	}
	buf[buf_length] = '\0'; /* callers treat the body as a C string */
	if (readstatus == -1) {
		SC_LOG(SC_LOG_ERROR, c, "readstatus == -1, endpoint: %s", url);
	}
	xmlNanoHTTPClose(r);
	SC_LOG(SC_LOG_DEBUG, c, "contentType = %s, redir = %s", contentType ? contentType : "NULL", redir ? redir : "NULL");
rc:
	free(endpoint_formatted);
	free(contentType);
	free(redir);
	free(hedrs);
	return buf;
}
htmlDocPtr sc_capix (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) {
if (!c || !endpoint)
return NULL;
size_t va_count = parse_printf_format(endpoint, 0, NULL);
char * endpoint_formatted = NULL;
if (isfmt && va_count > 0 && endpoint_formatted == NULL) {
va_list ap, ap2;
va_start(ap, endpoint);
va_copy(ap2, ap);
size_t strlenm = vsnprintf(NULL, 0, endpoint, ap);
endpoint_formatted = malloc(sizeof(char)*strlenm+1);
vsnprintf(endpoint_formatted, strlenm+1, endpoint, ap2);
va_end(ap);
va_end(ap2);
}
char * buf = sc_api(c, body, headers, 0, endpoint_formatted ? endpoint_formatted : endpoint);
htmlDocPtr htmldoc = parseHtmlDocument(buf, endpoint_formatted ? endpoint_formatted : endpoint);
free(buf);
free(endpoint_formatted);
return htmldoc;
}
/*
 * Finds the name of the CSS class whose rule body is `definition` inside the stylesheet text `haystack`.
 *
 * The first occurrence of definition is located, then the text is scanned backwards to the nearest '.'; the class
 * name is the run of alphanumeric characters that follows that dot.
 * google only has alphanumeric class names. TODO: be pedantic and conformic to w3 stds
 *
 * Returns a newly allocated copy of the class name, which the caller must free(), or NULL when an argument is NULL,
 * the definition is not present, no '.' precedes it, the name runs to the end of haystack or memory runs out.
 */
char * sc_find_class (char * haystack, const char * definition) { /* you must free class after calling */
	if (!haystack || !definition)
		return NULL;
	char * class = strstr(haystack, definition);
	if (!class)
		return NULL;
	/* walk back until class points at the first character after a dot */
	while (class > haystack && class[-1] != '.')
		class--;
	if (class == haystack) /* reached the start without seeing a dot */
		return NULL;
	const char * endofclass = class;
	/* the cast keeps isalnum defined for bytes >= 0x80 on platforms where char is signed */
	while (*endofclass && isalnum((unsigned char) *endofclass))
		endofclass++;
	if (!*endofclass) /* the name must be terminated by some non-alphanumeric character */
		return NULL;
	size_t length = (size_t) (endofclass-class);
	char * toreturn = malloc(length+1);
	if (!toreturn)
		return NULL;
	memcpy(toreturn, class, length);
	toreturn[length] = '\0';
	return toreturn;
}
/*
 * Fixes a (result) URL in-place by unwrapping google's tracking redirects, so the resulting URL is shorter or equal.
 *
 * h - address of the pointer to the NUL-terminated, writable URL string
 *
 * The cleaned URL is moved to the start of the buffer, so *h keeps pointing at the beginning of the original
 * allocation and can still be freed by whoever owns it.
 *
 * Returns -1 when h or *h is NULL, 1 otherwise.
 */
int sc_fix_url (char ** h) {
	if (!h || !*h) /* stage 0: nothing to work on */
		return -1;
	static const char redirect[] = "/url?q=";
	static const char weblight[] = "googleweblight.com/fp?u=";
	char * url = *h;
	if (!strncmp(url, redirect, strlen(redirect))) { /* stage 1: url may be tracking url by google results */
		char * target = url+strlen(redirect);
		*strchrnul(target, '&') = '\0'; /* drop the remaining tracking parameters */
		urldecode(target, target);
		memmove(url, target, strlen(target)+1);
	}
	char * light = strstr(url, weblight);
	if (light) { /* stage 2: url may be "light web" tracking url by google results */
		/* we could disable this with a cookie, but this is easier and _stateless_ */
		char * target = light+strlen(weblight);
		*strchrnul(target, '&') = '\0';
		urldecode(target, target);
		memmove(url, target, strlen(target)+1);
	} /* TODO: be pedantic and remove utm_source and other tracking parameters */
	return 1;
}
/*
 * Runs the search string s against google and stores the parsed results in a query object.
 *
 * s - search string; must not be NULL
 * c - cache handle; must not be NULL
 * q - query to fill, or NULL. query is in most cases NULL. then it will be allocated and put into sc_cache.
 *     otherwise response will be put into passed q. if query is not NULL, it MUST be initialized.
 *
 * Returns the filled query, or NULL on failure (invalid arguments, out of memory, failed request, page layout not
 * recognized, unparsable document). When q was passed in and the function fails before parsing, q is left untouched.
 */
struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */
	/*
		remarks:
		* we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website
		* we determine which class holds a specific value by looking at the css definitions
			- result title: the only class that has definition {color:#1967D2;font-size:14px;line-height:16px}
				+ A links have this class set, but they have a child SPAN element that then holds the text of the title
				+ A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link.
			- result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link.
				+ extract those two classes and find the one that is only present on SPAN text elements.
			- result description: once we have the result div, the description is the //table//span with the appropriate class
				+ the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements.
			- result div: to get the result div, we need the parent of the parent of the A link of the title.
		* result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP.
			- we won't parse those yet
		* I couldn't find anything with ratings, so we won't parse those either yet
		* captcha: google knows that this nokia phone we're pretending to be doesn't support javascript
			- the request limiting captcha must work on a phone without javascript. it is probably loaded inside an iframe, but has
				origin protection, so we can't just solve it client-side. we would have to proxy images and create some sort of a session
				based http request-response based user interface so we can ask the user to complete the captcha. this is not yet
				implemented and will be hard work.
	*/
	/* everything the cleanup code at rc touches is initialized before the first goto */
	int rs = 0;
	int qwasgiven = q ? 1 : 0;
	char * us = NULL;
	char * qstring = NULL;
	char * xpath = NULL;
	char * descxpath = NULL;
	char * breadxpath = NULL;
	char * descclass = NULL;
	char * titleclass = NULL;
	char * txtdoc = NULL;
	htmlDocPtr xmldoc = NULL;
	if (!s || !c) {
		rs = -1;
		goto rc;
	}
	us = malloc(strlen(s)*3+1); /* urlencoding can at most triple the length */
	qstring = strdup(s); /* copied up front so an allocation failure never happens while the lock is held */
	if (!us || !qstring) {
		SC_LOG(SC_LOG_ERROR, c, "out of memory");
		rs = -4;
		goto rc;
	}
	urlencode(us, s);
	txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100", us);
	free(us);
	us = NULL;
	if (!txtdoc) {
		SC_LOG(SC_LOG_ERROR, c, "!txtdoc");
		rs = -2;
		goto rc;
	}
	titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
	descclass = sc_find_class(txtdoc, "{word-break:break-word}");
	if (!titleclass || !descclass) {
		SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass");
		rs = -3;
		goto rc;
	}
#define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */
#define SC_GTXD /* description */ "../..//table//span[@class='%s']"
#define SC_GTXB /* breadcrumbs */ ".//span[@class='%s']"
#define SC_GTR q->results[q->results_length-1]
	/* the two characters of "%s" in each format are replaced by the class name, which leaves room for the terminator */
	xpath = malloc(strlen(titleclass)+strlen(SC_GTXF));
	descxpath = malloc(strlen(descclass)+strlen(SC_GTXD));
	breadxpath = malloc(strlen(descclass)+strlen(SC_GTXB));
	if (!xpath || !descxpath || !breadxpath) {
		SC_LOG(SC_LOG_ERROR, c, "out of memory");
		rs = -4;
		goto rc;
	}
	sprintf(xpath, SC_GTXF, titleclass);
	sprintf(descxpath, SC_GTXD, descclass);
	sprintf(breadxpath, SC_GTXB, descclass);
	xmldoc = parseHtmlDocument(txtdoc, NULL);
	if (!xmldoc) {
		SC_LOG(SC_LOG_ERROR, c, "!xmldoc");
		rs = -5;
		goto rc;
	}
	/* the query object is only allocated once every fallible preparation step succeeded, so failures above do not leak it */
	if (!q && !(q = sc_query_init())) {
		SC_LOG(SC_LOG_ERROR, c, "!q");
		rs = -6;
		goto rc;
	}
	if (qwasgiven) /* as you can see, when q is given, queries will be write-locked for the whole XML processing time! */
		SC_CWLE(c, c->queries_lock);
	q->results_length = 0;
	gnu_code_start;
	eachNodeX(xmldoc, xpath,
		lambda(void, (xmlNodePtr node, void * data),
			{
				if (node->type == XML_ELEMENT_NODE) {
					xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href");
					if (href) {
						/* sc_fix_url may move the pointer it is given, so the fixed URL is copied and the original attribute value is released separately */
						xmlChar * rawhref = xmlGetProp(node, BAD_CAST "href");
						char * fixedhref = (char *) rawhref;
						sc_fix_url(&fixedhref);
						char * hreflink = fixedhref ? strdup(fixedhref) : NULL;
						if (rawhref)
							xmlFree(rawhref);
						xmlNodePtr descnode = nthNodeXN(node, descxpath, 0);
						if (!descnode) /* description may be above, see https://support.google.com/websearch?p=featured_snippets */
							descnode = nthNodeXN(node, "../../div/div", 0);
						xmlNodePtr breadnode = nthNodeXN(node, breadxpath, 0);
						if (q->results_sizeof <= q->results_length)
							SC_BIGGER_ARRAY(q->results, sc_result);
						q->results_length++;
						SC_GTR->query = q;
						SC_GTR->title = (char *) xmlNodeGetContent(node->children);
						if (!SC_GTR->title)
							SC_GTR->title = strdup(SC_I18N_NO_TITLE);
						SC_GTR->url = hreflink;
						if (!SC_GTR->url)
							SC_GTR->url = strdup(SC_I18N_NO_HREFLINK);
						SC_GTR->desc = (char *) xmlNodeGetContent(descnode);
						if (!SC_GTR->desc)
							SC_GTR->desc = strdup(SC_I18N_NO_DESCRIPTION);
						SC_GTR->breadcrumbs = (char *) xmlNodeGetContent(breadnode);
						if (!SC_GTR->breadcrumbs) /* fall back to showing the URL */
							SC_GTR->breadcrumbs = SC_GTR->url ? strdup(SC_GTR->url) : NULL;
					}
				}
			}
		),
	NULL);
	gnu_code_end;
	q->cache = c;
	q->lookup_time = time(NULL);
	q->engines = SC_ENGINE_GOOGLE;
	free(q->string);
	q->string = qstring; /* ownership moves to the query */
	qstring = NULL;
	if (!qwasgiven) {
		SC_CWLE(c, c->queries_lock);
		if (c->queries_sizeof <= c->queries_length)
			SC_BIGGER_ARRAY(c->queries, sc_query);
		c->queries_length++;
#define SC_GTQ c->queries[c->queries_length-1]
		SC_GTQ = q;
	}
	SC_CUE(c, c->queries_lock);
rc:
	if (xmldoc)
		xmlFreeDoc(xmldoc);
	free(us);
	free(qstring);
	free(txtdoc);
	free(titleclass);
	free(descclass);
	free(xpath);
	free(descxpath);
	free(breadxpath);
	return (rs < 0) ? NULL : q;
}