From 12c413fd2e89b1d3a4d0f9e48fb6d12d0556c7ba Mon Sep 17 00:00:00 2001
From: Heiner Lohaus
Date: Sun, 5 Jan 2025 17:02:15 +0100
Subject: Add Edge as browser for nodriver
 Fix RetryProvider not retrying
 Add retry and continue for DuckDuckGo provider
 Add cache for Cloudflare provider
 Add cache for prompts on gui home
 Add scroll-to-bottom checkbox in gui
 Improve prompts on home gui
 Fix response content type in api for files

---
 g4f/tools/web_search.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/g4f/tools/web_search.py b/g4f/tools/web_search.py
index 14ff6e42..0464a15c 100644
--- a/g4f/tools/web_search.py
+++ b/g4f/tools/web_search.py
@@ -24,6 +24,7 @@ except:
 from typing import Iterator
 
 from ..cookies import get_cookies_dir
+from ..providers.response import format_link
 from ..errors import MissingRequirementsError
 from .. import debug
 
@@ -66,7 +67,7 @@ class SearchResultEntry():
     def set_text(self, text: str):
         self.text = text
 
-def scrape_text(html: str, max_words: int = None, add_source=True) -> Iterator[str]:
+def scrape_text(html: str, max_words: int = None, add_source=True, count_images: int = 2) -> Iterator[str]:
     source = BeautifulSoup(html, "html.parser")
     soup = source
     for selector in [
@@ -88,7 +89,20 @@ def scrape_text(html: str, max_words: int = None, add_source=True) -> Iterator[s
         if select:
             select.extract()
 
-    for paragraph in soup.select("p, table:not(:has(p)), ul:not(:has(p)), h1, h2, h3, h4, h5, h6"):
+    image_select = "img[alt][src^=http]:not([alt=''])"
+    image_link_select = f"a:has({image_select})"
+    for paragraph in soup.select(f"h1, h2, h3, h4, h5, h6, p, table:not(:has(p)), ul:not(:has(p)), {image_link_select}"):
+        image = paragraph.select_one(image_select)
+        if count_images > 0:
+            if image:
+                title = paragraph.get("title") or paragraph.text
+                if title:
+                    yield f"!{format_link(image['src'], title)}\n"
+                    if max_words is not None:
+                        max_words -= 10
+                    count_images -= 1
+                continue
+
         for line in paragraph.text.splitlines():
             words = [word for word in line.replace("\t", " ").split(" ") if word]
             count = len(words)
@@ -112,7 +126,7 @@ async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = No
         bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
         bucket_dir.mkdir(parents=True, exist_ok=True)
         md5_hash = hashlib.md5(url.encode()).hexdigest()
-        cache_file = bucket_dir / f"{url.split('?')[0].split('//')[1].replace('/', '+')[:16]}.{datetime.date.today()}.{md5_hash}.txt"
+        cache_file = bucket_dir / f"{url.split('?')[0].split('//')[1].replace('/', '+')[:16]}.{datetime.date.today()}.{md5_hash}.cache"
         if cache_file.exists():
             return cache_file.read_text()
         async with session.get(url) as response:
@@ -179,14 +193,15 @@ async def do_search(prompt: str, query: str = None, instructions: str = DEFAULT_
     md5_hash = hashlib.md5(json_bytes).hexdigest()
     bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / f"web_search" / f"{datetime.date.today()}"
     bucket_dir.mkdir(parents=True, exist_ok=True)
-    cache_file = bucket_dir / f"{quote_plus(query[:20])}.{md5_hash}.txt"
+    cache_file = bucket_dir / f"{quote_plus(query[:20])}.{md5_hash}.cache"
     if cache_file.exists():
         with cache_file.open("r") as f:
             search_results = f.read()
     else:
         search_results = await search(query, **kwargs)
-        with cache_file.open("w") as f:
-            f.write(str(search_results))
+        if search_results.results:
+            with cache_file.open("w") as f:
+                f.write(str(search_results))
     new_prompt = f"""
 {search_results}
 
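
Notes on the web_search.py changes, with small usage sketches (not part of the patch).

The main addition to scrape_text is image handling: two CSS selectors find anchors that wrap an image with a non-empty alt text and an absolute src, and up to count_images of them are emitted as markdown image links. The following self-contained sketch shows what those selectors match; format_link below is a local stand-in for g4f.providers.response.format_link, which this sketch assumes renders a plain markdown link.

    # Minimal sketch of the selector logic added to scrape_text.
    # format_link is a stand-in for g4f.providers.response.format_link;
    # it is assumed to render a markdown link, which the leading "!" turns
    # into a markdown image.
    from bs4 import BeautifulSoup

    def format_link(url: str, title: str) -> str:
        return f"[{title}]({url})"

    html = """
    <main>
      <a title="Example chart"><img alt="chart" src="http://example.com/chart.png"></a>
      <p>Some paragraph text.</p>
    </main>
    """

    soup = BeautifulSoup(html, "html.parser")
    image_select = "img[alt][src^=http]:not([alt=''])"  # non-empty alt, absolute src
    image_link_select = f"a:has({image_select})"        # anchors wrapping such an image

    for paragraph in soup.select(f"p, {image_link_select}"):
        image = paragraph.select_one(image_select)
        if image:
            title = paragraph.get("title") or paragraph.text
            print(f"!{format_link(image['src'], title)}")
        else:
            print(paragraph.text.strip())

Note that each emitted image also charges a flat 10 words against max_words, a rough budget so a handful of image links cannot crowd out the scraped text.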
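
Both caching paths also switch their file suffix from .txt to .cache, and the file names already encode an expiry policy: a readable slug derived from the URL, today's date, and an md5 of the full URL. A stdlib-only sketch of the naming scheme used by fetch_and_scrape (the helper name cache_name is ours, the expression matches the patched line):

    # Sketch of fetch_and_scrape's cache file naming.
    import datetime
    import hashlib

    def cache_name(url: str) -> str:
        # Host and path with "/" replaced by "+", truncated to 16 chars:
        # a human-readable prefix for skimming the cache directory.
        slug = url.split('?')[0].split('//')[1].replace('/', '+')[:16]
        # The date in the name makes every entry expire naturally after one day;
        # the md5 of the full URL (query string included) keeps keys unique.
        return f"{slug}.{datetime.date.today()}.{hashlib.md5(url.encode()).hexdigest()}.cache"

    print(cache_name("https://example.com/docs/page?tab=1"))
    # -> example.com+docs.<today>.<md5 of full url>.cache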
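
do_search uses the same daily read-through cache, plus a new guard: results are only written when search_results.results is non-empty, so a transient failure or an empty answer is not pinned in the cache for the rest of the day. A condensed sketch of that pattern, with SearchResults and search() as stand-ins for the module's own definitions:

    # Read-through cache with an empty-result guard, as in do_search.
    # SearchResults and search() are stand-ins, not the module's real code.
    import asyncio
    from dataclasses import dataclass, field
    from pathlib import Path

    @dataclass
    class SearchResults:
        results: list = field(default_factory=list)
        def __str__(self):
            return "\n".join(self.results)

    async def search(query: str) -> SearchResults:
        return SearchResults([f"result for {query}"])

    async def cached_search(query: str, cache_file: Path) -> str:
        if cache_file.exists():
            return cache_file.read_text()        # hit: reuse today's rendered results
        search_results = await search(query)
        if search_results.results:               # guard: never cache an empty result
            cache_file.write_text(str(search_results))
        return str(search_results)

    print(asyncio.run(cached_search("example", Path("example.cache"))))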