From 0d59789eedf3784cf4c3aaf764785a4ad91723c4 Mon Sep 17 00:00:00 2001
From: Heiner Lohaus
Date: Wed, 1 Jan 2025 14:01:33 +0100
Subject: Add File API Documentation for Python and JS
 Format Bucket Placeholder in GUI

---
 g4f/tools/web_search.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/g4f/tools/web_search.py b/g4f/tools/web_search.py
index 9033e0ad..780e45df 100644
--- a/g4f/tools/web_search.py
+++ b/g4f/tools/web_search.py
@@ -4,7 +4,10 @@ from aiohttp import ClientSession, ClientTimeout, ClientError
 import json
 import hashlib
 from pathlib import Path
-from collections import Counter
+from urllib.parse import urlparse
+import datetime
+import asyncio
+
 try:
     from duckduckgo_search import DDGS
     from duckduckgo_search.exceptions import DuckDuckGoSearchException
@@ -17,13 +20,12 @@ try:
     has_spacy = True
 except:
     has_spacy = False
+
 from typing import Iterator
 from ..cookies import get_cookies_dir
 from ..errors import MissingRequirementsError
 from .. import debug
 
-import asyncio
-
 DEFAULT_INSTRUCTIONS = """
 Using the provided web search results, to write a comprehensive reply to the user request.
 Make sure to add the sources of cites using [[Number]](Url) notation after the reference. Example: [[0]](http://google.com)
@@ -64,7 +66,8 @@ class SearchResultEntry():
         self.text = text
 
 def scrape_text(html: str, max_words: int = None) -> Iterator[str]:
-    soup = BeautifulSoup(html, "html.parser")
+    source = BeautifulSoup(html, "html.parser")
+    soup = source
     for selector in [
             "main",
             ".main-content-wrapper",
@@ -96,12 +99,18 @@ def scrape_text(html: str, max_words: int = None) -> Iterator[str]:
                 break
         yield " ".join(words) + "\n"
 
+    canonical_link = source.find("link", rel="canonical")
+    if canonical_link and "href" in canonical_link.attrs:
+        link = canonical_link["href"]
+        domain = urlparse(link).netloc
+        yield f"\nSource: [{domain}]({link})"
+
 async def fetch_and_scrape(session: ClientSession, url: str, max_words: int = None) -> str:
     try:
         bucket_dir: Path = Path(get_cookies_dir()) / ".scrape_cache" / "fetch_and_scrape"
         bucket_dir.mkdir(parents=True, exist_ok=True)
         md5_hash = hashlib.md5(url.encode()).hexdigest()
-        cache_file = bucket_dir / f"{url.split('/')[3]}.{md5_hash}.txt"
+        cache_file = bucket_dir / f"{url.split('/')[3]}.{datetime.date.today()}.{md5_hash}.txt"
         if cache_file.exists():
             return cache_file.read_text()
         async with session.get(url) as response:
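
The first functional change above is the new tail of scrape_text(): after the
scraped paragraphs, it yields a Markdown source line built from the page's
canonical <link> tag. A minimal standalone sketch of that behavior, using only
the bs4 and urllib.parse calls visible in the hunk (the sample HTML is
illustrative, not from the repository):

    from urllib.parse import urlparse
    from bs4 import BeautifulSoup

    # Illustrative input; any page that declares a canonical URL works.
    html = '<html><head><link rel="canonical" href="https://example.com/page"></head></html>'
    source = BeautifulSoup(html, "html.parser")

    canonical_link = source.find("link", rel="canonical")
    if canonical_link and "href" in canonical_link.attrs:
        link = canonical_link["href"]
        domain = urlparse(link).netloc          # "example.com"
        print(f"\nSource: [{domain}]({link})")  # the string scrape_text() yields

This also explains the source/soup split at the top of the function:
presumably the selector loop can rebind soup to a sub-tree that no longer
contains the document <head>, so the canonical link must be looked up on the
original parse kept in source.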
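
The second change embeds the current date in the cache file name, which in
effect gives each scraped page a one-day cache lifetime: tomorrow's request
computes a different path and misses. A sketch of the key construction (the
URL and cache root are illustrative; the real code roots the directory at
get_cookies_dir()):

    import datetime
    import hashlib
    from pathlib import Path

    url = "https://example.com/some/page"
    bucket_dir = Path(".scrape_cache") / "fetch_and_scrape"
    md5_hash = hashlib.md5(url.encode()).hexdigest()
    # url.split('/')[3] is the first path segment ("some" here); it is an
    # empty string for bare URLs such as "https://example.com/".
    cache_file = bucket_dir / f"{url.split('/')[3]}.{datetime.date.today()}.{md5_hash}.txt"
    print(cache_file)  # .scrape_cache/fetch_and_scrape/some.<YYYY-MM-DD>.<md5>.txt

Note that nothing in this patch deletes the previous days' files; they simply
stop being read once the date component changes.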