diff options
Diffstat (limited to '')
-rw-r--r-- | g4f/gui/server/internet.py | 187 |
1 files changed, 139 insertions, 48 deletions
from __future__ import annotations

from bs4 import BeautifulSoup
from aiohttp import ClientSession, ClientTimeout
from duckduckgo_search import DDGS
import asyncio


class SearchResults():
    """Iterable container of SearchResultEntry objects.

    Rendering it with str() produces an LLM-friendly blob: each entry's
    title, scraped text (or snippet as a fallback) and a numbered
    markdown source link ``[[idx]](url)``.
    """

    def __init__(self, results: list):
        self.results = results

    def __iter__(self):
        yield from self.results

    def __str__(self):
        search = ""
        for idx, result in enumerate(self.results):
            if search:
                search += "\n\n\n"
            search += f"Title: {result.title}\n\n"
            # Prefer the full scraped page text; fall back to the search snippet.
            if result.text:
                search += result.text
            else:
                search += result.snippet
            search += f"\n\nSource: [[{idx}]]({result.url})"
        return search


class SearchResultEntry():
    """One search hit: title, URL, snippet and (optionally) scraped page text."""

    def __init__(self, title: str, url: str, snippet: str, text: str | None = None):
        self.title = title
        self.url = url
        self.snippet = snippet
        self.text = text

    def set_text(self, text: str):
        """Attach the scraped full-page text to this entry."""
        self.text = text


def scrape_text(html: str, max_words: int | None = None) -> str:
    """Extract readable paragraph text from an HTML page.

    Strips <script>/<style>, narrows to the first matching main-content
    container when one exists, removes known boilerplate, then joins the
    words of every <p> line. When ``max_words`` is given, output is
    truncated once the word budget is exhausted.

    :param html: raw HTML document.
    :param max_words: optional word budget; ``None`` means unlimited.
    :return: cleaned text, one source line per output line.
    """
    soup = BeautifulSoup(html, "html.parser")
    for exclude in soup(["script", "style"]):
        exclude.extract()
    # Narrow to the first main-content wrapper found, if any.
    for selector in [
        "main",
        ".main-content-wrapper",
        ".main-content",
        ".emt-container-inner",
        ".content-wrapper",
        "#content",
        "#mainContent",
    ]:
        select = soup.select_one(selector)
        if select:
            soup = select
            break
    # Zdnet
    for remove in [".c-globalDisclosure"]:
        select = soup.select_one(remove)
        if select:
            select.extract()
    clean_text = ""
    for paragraph in soup.select("p"):
        text = paragraph.get_text()
        for line in text.splitlines():
            # Collapse tabs/runs of spaces into single-space-separated words.
            words = [word for word in line.replace("\t", " ").split(" ") if word]
            count = len(words)
            if not count:
                continue
            if max_words:
                max_words -= count
                if max_words <= 0:
                    break
            if clean_text:
                clean_text += "\n"
            clean_text += " ".join(words)

    return clean_text


async def fetch_and_scrape(session: ClientSession, url: str, max_words: int | None = None) -> str | None:
    """Download ``url`` and return its scraped text, or ``None`` on any failure.

    Best-effort by design: network errors, timeouts and non-200 responses
    all yield ``None`` so one dead link never aborts the whole search.
    """
    try:
        async with session.get(url) as response:
            if response.status == 200:
                html = await response.text()
                return scrape_text(html, max_words)
    except Exception:
        # was a bare `except:`; narrowed so task cancellation still propagates
        pass
    return None


async def search(query: str, n_results: int = 5, max_words: int = 2500, add_text: bool = True) -> SearchResults:
    """Run a DuckDuckGo text search and optionally scrape each hit's page.

    :param query: search query string.
    :param n_results: maximum number of hits to collect.
    :param max_words: overall word budget across all formatted results.
    :param add_text: when True, fetch each hit's page and attach its text.
    :return: SearchResults trimmed to roughly ``max_words`` words.
    """
    # NOTE(review): DDGS is a synchronous client; this blocks the event
    # loop while paging results — confirm acceptable for the GUI server.
    with DDGS() as ddgs:
        results = []
        for result in ddgs.text(
            query,
            region="wt-wt",
            safesearch="moderate",
            timelimit="y",
        ):
            results.append(SearchResultEntry(
                result["title"],
                result["href"],
                result["body"]
            ))
            if len(results) >= n_results:
                break

    if add_text:
        # Per-page budget; max(..., 1) avoids ZeroDivisionError when n_results == 1.
        per_page_words = max_words // max(n_results - 1, 1)
        requests = []
        async with ClientSession(timeout=ClientTimeout(5)) as session:
            for entry in results:
                requests.append(fetch_and_scrape(session, entry.url, per_page_words))
            texts = await asyncio.gather(*requests)

    formatted_results = []
    left_words = max_words
    for i, entry in enumerate(results):
        if add_text:
            entry.text = texts[i]
        if left_words:
            # +5 accounts for the title/source boilerplate around each entry.
            left_words -= entry.title.count(" ") + 5
            if entry.text:
                left_words -= entry.text.count(" ")
            else:
                left_words -= entry.snippet.count(" ")
            if left_words < 0:
                break
        formatted_results.append(entry)

    return SearchResults(formatted_results)


def get_search_message(prompt) -> str:
    """Wrap ``prompt`` with web search results and citation instructions.

    Falls back to returning the bare prompt when the search fails for any
    reason (offline, rate-limited, DDG change), so chat still works.
    """
    try:
        search_results = asyncio.run(search(prompt))
        message = f"""
{search_results}

Instruction: Using the provided web search results, to write a comprehensive reply to the user request.
Make sure to add the sources of cites using [[Number]](Url) notation after the reference. Example: [[0]](http://google.com)
If the provided search results refer to multiple subjects with the same name, write separate answers for each subject.
User request:
{prompt}
"""
        return message
    except Exception as e:
        print("Couldn't search DuckDuckGo:", e)
        return prompt