summary | refs | log | tree | commit | diff | stats
path: root/src/api.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/api.c 103
1 files changed, 88 insertions, 15 deletions
diff --git a/src/api.c b/src/api.c
index ae8d619..6ad996e 100644
--- a/src/api.c
+++ b/src/api.c
@@ -107,7 +107,9 @@ char * sc_find_class (char * haystack, const char * definition) { /* you must fr
toreturn[endofclass-class] = '\0';
return toreturn;
}
-int sc_query_google (char * s, struct sc_cache * c) {
+struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */
+ /* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */
+ /* if query is not NULL, it MUST be initialized */
/*
remarks:
* we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website
@@ -117,7 +119,7 @@ int sc_query_google (char * s, struct sc_cache * c) {
+ A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link.
- result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link.
+ extract those two classes and find the one that is only present on SPAN text elements.
- - result description: once we have the result div, the description is the // span with the appropriate class
+ - result description: once we have the result div, the description is the //table//span with the appropriate class
+ the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements.
- result div: to get the result div, we need the parent of the parent of the A link of the title.
* result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP.
@@ -129,35 +131,106 @@ int sc_query_google (char * s, struct sc_cache * c) {
based http request-response based user interface so we can ask the user to complete the captcha. this is not yet
implemeted and will be hard work.
*/
- if (!s || !c)
- return -1;
- int rs = 1;
+ int rs;
+ if (!s || !c) {
+ rs = -1;
+ goto rc;
+ }
+ int qwasgiven = 0;
+ if (!q)
+ q = sc_query_init();
+ else
+ qwasgiven++;
char * us = malloc(sizeof(char)*strlen(s)*3+1);
urlencode(us, s);
+ char * xpath = NULL;
+ char * descclass = NULL;
+ char * titleclass = NULL;
char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us);
// fprintf(stdout, "%s\n", txtdoc);
free(us);
if (!txtdoc) {
+ SC_LOG(SC_LOG_ERROR, c, "!txtdoc");
rs = -2;
goto rc;
}
- char * titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
- if (!titleclass) {
- SC_LOG(SC_LOG_ERROR, c, "!titleclass");
+ titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
+ descclass = sc_find_class(txtdoc, "{word-break:break-word}");
+ if (!titleclass || !descclass) {
+ SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass");
rs = -3;
goto rc;
}
-#define SC_GTXF "/html/body//a[contains(@class, '%s')]" // @class='fuLhoc ZWRArf'"
- char * xpath = malloc(strlen(titleclass)+strlen(SC_GTXF));
+#define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */
+#define SC_GTXD "../..//table//span[@class='%s']"
+#define SC_GTR q->results[q->results_length-1]
+ xpath = malloc(strlen(titleclass)+strlen(SC_GTXF));
sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */
- fprintf(stdout, "%s\n", xpath);
htmlDocPtr xmldoc = parseHtmlDocument(txtdoc, NULL);
- xmlXPathObjectPtr nodes = findNodes(xmldoc, xpath);
- eachNode(nodes, printLinkNode, NULL);
-rc:
+ if (qwasgiven) /* as you can see, when q is given, queries will be write-locked for the whole XML processing time! */
+ SC_CWLE(c, c->queries_lock);
+ q->results_length = 0;
+ gnu_code_start;
+ eachNodeX(xmldoc, xpath,
+ lambda(void, (xmlNodePtr node, void * data),
+ {
+ if (node->type == XML_ELEMENT_NODE) {
+ xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href");
+ if (href) {
+ char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href");
+ if (!strncmp(hreflink, "/url?q=", strlen("/url?q="))) {
+ hreflink = hreflink+strlen("/url?q=");
+ *strchrnul(hreflink, '&') = '\0';
+ urldecode(hreflink, hreflink);
+ }
+ char * x = malloc(strlen(descclass)+strlen(SC_GTXD));
+ sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */);
+ xmlNodePtr descnode = nthNodeXN(node, x, 0);
+ free(x);
+ if (q->results_sizeof <= q->results_length)
+ SC_BIGGER_ARRAY(q->results, sc_result);
+ q->results_length++;
+ SC_GTR->query = q;
+ SC_GTR->title = (char *) xmlNodeGetContent(node->children);
+ if (!SC_GTR->title) {
+ SC_GTR->title = malloc(strlen(SC_I18N_NO_TITLE)+1);
+ strcpy(SC_GTR->title, SC_I18N_NO_TITLE);
+ }
+ SC_GTR->url = hreflink;
+ if (!SC_GTR->url) {
+ SC_GTR->url = malloc(strlen(SC_I18N_NO_HREFLINK)+1);
+ strcpy(SC_GTR->url, SC_I18N_NO_HREFLINK);
+ }
+ SC_GTR->desc = (char *) xmlNodeGetContent(descnode);
+ if (!SC_GTR->desc) {
+ SC_GTR->desc = malloc(strlen(SC_I18N_NO_DESCRIPTION)+1);
+ strcpy(SC_GTR->desc, SC_I18N_NO_DESCRIPTION);
+ }
+ }
+ }
+ }
+ ),
+ NULL);
+ gnu_code_end;
+ q->cache = c;
+ q->lookup_time = time(NULL);
+ q->engines = SC_ENGINE_GOOGLE;
+ q->string = realloc(q->string, strlen(s)+1);
+ strcpy(q->string, s);
+ if (!qwasgiven) {
+ SC_CWLE(c, c->queries_lock);
+ if (c->queries_sizeof <= c->queries_length)
+ SC_BIGGER_ARRAY(c->queries, sc_query);
+ c->queries_length++;
+#define SC_GTQ c->queries[c->queries_length-1]
+ SC_GTQ = q;
+ }
+ SC_CUE(c, c->queries_lock);
xmlFreeDoc(xmldoc);
+rc:
free(txtdoc);
free(titleclass);
+ free(descclass);
free(xpath);
- return rs;
+ return (rs < 0) ? NULL : q;
}