From 12c413fd2e89b1d3a4d0f9e48fb6d12d0556c7ba Mon Sep 17 00:00:00 2001
From: Heiner Lohaus
Date: Sun, 5 Jan 2025 17:02:15 +0100
Subject: Add Edge as browser for nodriver
 Fix RetryProvider not retrying
 Add retry and continue for DuckDuckGo provider
 Add cache for Cloudflare provider
 Add cache for prompts on gui home
 Add scroll-to-bottom checkbox in gui
 Improve prompts on home gui
 Fix response content type in api for files

---
 g4f/tools/web_search.py | 27 +++++++++++++++++++++------
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/g4f/tools/web_search.py b/g4f/tools/web_search.py
index 14ff6e42..0464a15c 100644
--- a/g4f/tools/web_search.py
+++ b/g4f/tools/web_search.py
@@ -24,6 +24,7 @@ except:
 from typing import Iterator
 
 from ..cookies import get_cookies_dir
+from ..providers.response import format_link
 from ..errors import MissingRequirementsError
 from .. import debug
 
@@ -66,7 +67,7 @@ class SearchResultEntry():
     def set_text(self, text: str):
         self.text = text
 
-def scrape_text(html: str, max_words: int = None, add_source=True) -> Iterator[str]:
+def scrape_text(html: str, max_words: int = None, add_source=True, count_images: int = 2) -> Iterator[str]:
     source = BeautifulSoup(html, "html.parser")
     soup = source
     for selector in [
@@ -88,7 +89,20 @@ def scrape_text(html: str, max_words: int = None, add_source=True) -> Iterator[s
         if select:
             select.extract()
 
-    for paragraph in soup.select("p, table:not(:has(p)), ul:not(:has(p)), h1, h2, h3, h4, h5, h6"):
+    image_select = "img[alt][src^=http]:not([alt=''])"
+    image_link_select = f"a:has({image_select})"
+    for paragraph in soup.select(f"h1, h2, h3, h4, h5, h6, p, table:not(:has(p)), ul:not(:has(p)), {image_link_select}"):
+        image = paragraph.select_one(image_select)
+        if count_images > 0:
+            if image:
+                title = paragraph.get("title") or paragraph.text
+                if title:
+                    yield f"!{format_link(image['src'], title)}\n"
+                    if max_words is not None:
+                        max_words -= 10
+                    count_images -= 1
+                continue
+
         for line in paragraph.text.splitlines():
             words = [word for word in line.replace("\t", " ").split(" ") if word]
             count = len(words)
@@ -112,7 +126,7 @@ async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = No
         bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
         bucket_dir.mkdir(parents=True, exist_ok=True)
         md5_hash = hashlib.md5(url.encode()).hexdigest()
-        cache_file = bucket_dir / f"{url.split('?')[0].split('//')[1].replace('/', '+')[:16]}.{datetime.date.today()}.{md5_hash}.txt"
+        cache_file = bucket_dir / f"{url.split('?')[0].split('//')[1].replace('/', '+')[:16]}.{datetime.date.today()}.{md5_hash}.cache"
         if cache_file.exists():
             return cache_file.read_text()
         async with session.get(url) as response:
@@ -179,14 +193,15 @@ async def do_search(prompt: str, query: str = None, instructions: str = DEFAULT_
     md5_hash = hashlib.md5(json_bytes).hexdigest()
     bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / f"web_search" / f"{datetime.date.today()}"
     bucket_dir.mkdir(parents=True, exist_ok=True)
-    cache_file = bucket_dir / f"{quote_plus(query[:20])}.{md5_hash}.txt"
+    cache_file = bucket_dir / f"{quote_plus(query[:20])}.{md5_hash}.cache"
     if cache_file.exists():
         with cache_file.open("r") as f:
             search_results = f.read()
     else:
         search_results = await search(query, **kwargs)
-        with cache_file.open("w") as f:
-            f.write(str(search_results))
+        if search_results.results:
+            with cache_file.open("w") as f:
+                f.write(str(search_results))
     new_prompt = f"""
 {search_results}
 
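
Notes on the web_search.py changes, with small usage sketches (not part of the patch).

The main addition to scrape_text is image handling: two CSS selectors find anchors that wrap an image with a non-empty alt text and an absolute src, and up to count_images of them are emitted as markdown image links. The following self-contained sketch shows what those selectors match; format_link below is a local stand-in for g4f.providers.response.format_link, which this sketch assumes renders a plain markdown link.

    # Minimal sketch of the selector logic added to scrape_text.
    # format_link is a stand-in for g4f.providers.response.format_link;
    # it is assumed to render a markdown link, which the leading "!" turns
    # into a markdown image.
    from bs4 import BeautifulSoup

    def format_link(url: str, title: str) -> str:
        return f"[{title}]({url})"

    html = """
    <main>
      <a title="Example chart"><img alt="chart" src="http://example.com/chart.png"></a>
      <p>Some paragraph text.</p>
    </main>
    """

    soup = BeautifulSoup(html, "html.parser")
    image_select = "img[alt][src^=http]:not([alt=''])"  # non-empty alt, absolute src
    image_link_select = f"a:has({image_select})"        # anchors wrapping such an image

    for paragraph in soup.select(f"p, {image_link_select}"):
        image = paragraph.select_one(image_select)
        if image:
            title = paragraph.get("title") or paragraph.text
            print(f"!{format_link(image['src'], title)}")
        else:
            print(paragraph.text.strip())

Note that each emitted image also charges a flat 10 words against max_words, a rough budget so a handful of image links cannot crowd out the scraped text.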
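
Both caching paths also switch their file suffix from .txt to .cache, and the file names already encode an expiry policy: a readable slug derived from the URL, today's date, and an md5 of the full URL. A stdlib-only sketch of the naming scheme used by fetch_and_scrape (the helper name cache_name is ours, the expression matches the patched line):

    # Sketch of fetch_and_scrape's cache file naming.
    import datetime
    import hashlib

    def cache_name(url: str) -> str:
        # Host and path with "/" replaced by "+", truncated to 16 chars:
        # a human-readable prefix for skimming the cache directory.
        slug = url.split('?')[0].split('//')[1].replace('/', '+')[:16]
        # The date in the name makes every entry expire naturally after one day;
        # the md5 of the full URL (query string included) keeps keys unique.
        return f"{slug}.{datetime.date.today()}.{hashlib.md5(url.encode()).hexdigest()}.cache"

    print(cache_name("https://example.com/docs/page?tab=1"))
    # -> example.com+docs.<today>.<md5 of full url>.cache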
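
do_search uses the same daily read-through cache, plus a new guard: results are only written when search_results.results is non-empty, so a transient failure or an empty answer is not pinned in the cache for the rest of the day. A condensed sketch of that pattern, with SearchResults and search() as stand-ins for the module's own definitions:

    # Read-through cache with an empty-result guard, as in do_search.
    # SearchResults and search() are stand-ins, not the module's real code.
    import asyncio
    from dataclasses import dataclass, field
    from pathlib import Path

    @dataclass
    class SearchResults:
        results: list = field(default_factory=list)
        def __str__(self):
            return "\n".join(self.results)

    async def search(query: str) -> SearchResults:
        return SearchResults([f"result for {query}"])

    async def cached_search(query: str, cache_file: Path) -> str:
        if cache_file.exists():
            return cache_file.read_text()        # hit: reuse today's rendered results
        search_results = await search(query)
        if search_results.results:               # guard: never cache an empty result
            cache_file.write_text(str(search_results))
        return str(search_results)

    print(asyncio.run(cached_search("example", Path("example.cache"))))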