# Source code for orka.tools.search_tools

# OrKa: Orchestrator Kit Agents
# Copyright © 2025 Marco Somma
#
# This file is part of OrKa – https://github.com/marcosomma/orka-reasoning
#
# Licensed under the Apache License, Version 2.0 (Apache 2.0).
# You may not use this file for commercial purposes without explicit permission.
#
# Full license: https://www.apache.org/licenses/LICENSE-2.0
# For commercial use, contact: marcosomma.work@gmail.com
#
# Required attribution: OrKa by Marco Somma – https://github.com/marcosomma/orka-reasoning

"""
Search Tools Module
===================

This module implements web search tools for the OrKa framework.
These tools provide capabilities to search the web using various search engines.

The search tools in this module include:
- GoogleSearchTool: Searches the web using Google Custom Search API
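- DuckDuckGoTool: Searches the web using the DuckDuckGo search engine
- WebSearchTool: Tries DuckDuckGo, public SearX instances, and an HTML-scraping
  fallback in turn, returning the first non-empty result set
- SimpleSearchTool: Returns canned guidance without calling any external service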

These tools can be used within workflows to retrieve real-time information
from the web, enabling agents to access up-to-date knowledge that might not
be present in their training data.
"""

import logging
from typing import Any, List

# Optional imports for search engines
try:
    from duckduckgo_search import DDGS

    HAS_DUCKDUCKGO = True
    DDGS_INSTANCE: Any = DDGS
except ImportError:
    DDGS_INSTANCE = None
    HAS_DUCKDUCKGO = False

# Optional import for requests (for alternative search methods)
try:
    import requests

    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False

# Optional import for BeautifulSoup (for web scraping fallback)
try:
    from bs4 import BeautifulSoup

    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False

from .base_tool import BaseTool

logger = logging.getLogger(__name__)


class DuckDuckGoTool(BaseTool):
    """
    A tool that performs web searches using the DuckDuckGo search engine.
    Returns search result snippets from the top results.

    Every successful or failed search returns a list whose first element is a
    "Current date and time: ..." line, followed by either result snippets or a
    single human-readable status message.
    """

    # Limits applied to every search; exposed as class attributes so that
    # subclasses or instances can tune them without overriding methods.
    MAX_RESULTS = 5
    MAX_SNIPPET_CHARS = 500
    MAX_RETRIES = 2

    def run(self, input_data: Any) -> List[str]:
        """
        Perform a DuckDuckGo search and return result snippets.

        For dict input the query is the first non-empty value among, in order:
        ``input_data["formatted_prompt"]``, ``self.formatted_prompt``,
        ``self.prompt``, ``input_data["input"]`` and ``input_data["query"]``.
        Any other input is used as the query directly.

        Args:
            input_data (dict): Input containing search query.

        Returns:
            list: List of search result snippets, or a single-element list
            with an explanatory message when the search cannot be attempted.
        """
        # Check if DuckDuckGo is available
        if not HAS_DUCKDUCKGO:
            return ["DuckDuckGo search not available - duckduckgo_search package not installed"]

        if isinstance(input_data, dict):
            # Prioritize the formatted_prompt from the orchestrator, then fall
            # back to the other sources. Empty values are skipped so that a
            # present-but-blank formatted_prompt does not hide a usable query.
            candidates = (
                input_data.get("formatted_prompt"),
                getattr(self, "formatted_prompt", None),
                getattr(self, "prompt", None),
                input_data.get("input"),
                input_data.get("query"),
            )
            query = next((candidate for candidate in candidates if candidate), "")
        else:
            query = input_data

        if not query:
            return ["No query provided"]

        # Convert to string if needed, then execute the real search
        return self._execute_search(str(query))

    def _extract_snippets(self, results: Any) -> List[str]:
        """Return cleaned, truncated ``body`` texts from raw DDGS results."""
        snippets: List[str] = []
        for item in results or ():
            if not (isinstance(item, dict) and "body" in item):
                continue
            # A None body must not become the literal snippet "None".
            body = str(item["body"] or "").strip()
            if len(body) > self.MAX_SNIPPET_CHARS:
                body = body[: self.MAX_SNIPPET_CHARS] + "..."
            if body:
                snippets.append(body)
        return snippets[: self.MAX_RESULTS]

    def _execute_search(self, query: str) -> List[str]:
        """
        Execute actual DuckDuckGo search with improved error handling.

        Each attempt opens a DDGS session and tries a text search followed by
        a news search; the first one that yields snippets wins. Failed or
        empty attempts are retried with exponential backoff.

        Args:
            query: The search string.

        Returns:
            list: The timestamp line followed by snippets, or by a status
            message when every attempt failed.
        """
        import time
        from datetime import datetime

        timestamp = f"Current date and time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

        try:
            retry_delay = 1
            for attempt in range(1, self.MAX_RETRIES + 1):
                try:
                    with DDGS_INSTANCE(timeout=10) as ddgs:
                        # Text search first, news search as the fallback.
                        for method_name, label in (("text", "Text"), ("news", "News")):
                            try:
                                # Resolve the method inside the try so that a
                                # missing API only fails this search kind.
                                search = getattr(ddgs, method_name)
                                snippets = self._extract_snippets(
                                    list(search(query, max_results=self.MAX_RESULTS))
                                )
                            except Exception as search_error:
                                logger.warning(
                                    "%s search failed on attempt %d: %s",
                                    label,
                                    attempt,
                                    search_error,
                                )
                                continue
                            if snippets:
                                logger.info(
                                    "DuckDuckGo %s search returned %d results",
                                    method_name,
                                    len(snippets),
                                )
                                return [timestamp] + snippets

                        # If we get here, neither search produced snippets
                        logger.warning(
                            "Both text and news searches returned empty results on attempt %d",
                            attempt,
                        )
                except Exception as ddgs_error:
                    logger.warning(
                        "DDGS initialization failed on attempt %d: %s", attempt, ddgs_error
                    )

                # No point sleeping after the final attempt.
                if attempt < self.MAX_RETRIES:
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff

            # All attempts failed
            logger.error("All DuckDuckGo search attempts failed")
            return [timestamp, "Search temporarily unavailable - please try again later"]
        except Exception as e:
            logger.error("DuckDuckGo search failed with unexpected error: %s", e)
            return [timestamp, f"Search error: {str(e)}"]
class WebSearchTool(BaseTool):
    """
    A more robust web search tool that tries multiple search methods.
    Falls back through different search engines and methods.

    The methods are tried in this order: DuckDuckGo (via ``duckduckgo_search``),
    public SearX instances (via ``requests``), and finally scraping the
    DuckDuckGo HTML endpoint (via ``requests`` + ``BeautifulSoup``).
    """

    # Public SearX instances (these change frequently)
    SEARX_INSTANCES = ("https://searx.be", "https://search.sapti.me", "https://searx.info")

    def run(self, input_data: Any) -> List[str]:
        """
        Perform web search using multiple fallback methods.

        Args:
            input_data: Input containing search query.

        Returns:
            list: List of search result snippets. The first element is a
            timestamp line, except for the single-element "No query provided"
            response.
        """
        query = self._extract_query(input_data)
        if not query:
            return ["No query provided"]

        timestamp = self._timestamp()

        # Try search methods in order of preference
        for method in (self._duckduckgo_search, self._searx_search, self._fallback_search):
            try:
                results = method(query)
            except Exception as e:
                logger.warning("Search method %s failed: %s", method.__name__, e)
                continue
            if results and len(results) > 1:  # More than just timestamp
                logger.info("Search successful using %s", method.__name__)
                return results

        # All methods failed
        return [timestamp, "All search methods unavailable - please check internet connection"]

    @staticmethod
    def _timestamp() -> str:
        """Return the "Current date and time: ..." line prepended to results."""
        from datetime import datetime

        return f"Current date and time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

    @staticmethod
    def _clean(text: Any, limit: int) -> str:
        """Stringify, strip and truncate a snippet; None or empty yields ''."""
        cleaned = str(text or "").strip()
        if len(cleaned) > limit:
            cleaned = cleaned[:limit] + "..."
        return cleaned

    def _extract_query(self, input_data: Any) -> str:
        """
        Extract the query string from input data.

        For dict input the first non-empty value wins among, in order:
        ``input_data["formatted_prompt"]``, ``self.formatted_prompt``,
        ``self.prompt``, ``input_data["input"]`` and ``input_data["query"]``.
        Empty values are skipped so a blank ``formatted_prompt`` cannot mask a
        usable query further down the chain.

        Returns:
            str: The query, or an empty string when none was supplied.
        """
        if isinstance(input_data, dict):
            candidates = (
                input_data.get("formatted_prompt"),
                getattr(self, "formatted_prompt", None),
                getattr(self, "prompt", None),
                input_data.get("input"),
                input_data.get("query"),
            )
            query = next((candidate for candidate in candidates if candidate), "")
        else:
            query = input_data
        return str(query) if query else ""

    def _duckduckgo_search(self, query: str) -> List[str]:
        """
        Try DuckDuckGo search.

        Raises:
            Exception: If the package is missing or no snippet was found.
        """
        if not HAS_DUCKDUCKGO:
            raise Exception("DuckDuckGo not available")

        timestamp = self._timestamp()

        with DDGS_INSTANCE(timeout=10) as ddgs:
            # Try text search
            results = list(ddgs.text(query, max_results=5))
            snippets = [
                body
                for body in (
                    self._clean(r["body"], 500)
                    for r in results
                    if isinstance(r, dict) and "body" in r
                )
                if body
            ]
            if snippets:
                return [timestamp] + snippets[:5]

        raise Exception("No results from DuckDuckGo")

    def _searx_search(self, query: str) -> List[str]:
        """
        Try SearX public instances.

        Instances are queried in order; the first that yields at least one
        non-empty ``content`` snippet among its first five results wins.

        Raises:
            Exception: If ``requests`` is missing or no instance returned
                usable results.
        """
        if not HAS_REQUESTS:
            raise Exception("Requests library not available")

        timestamp = self._timestamp()

        for instance in self.SEARX_INSTANCES:
            try:
                response = requests.get(
                    f"{instance}/search",
                    params={"q": query, "format": "json", "categories": "general"},
                    timeout=10,
                    headers={"User-Agent": "OrKa-Search/1.0"},
                )
                if response.status_code != 200:
                    continue

                results = response.json().get("results", [])
                snippets = []
                for r in (results or [])[:5]:
                    # Skip malformed entries instead of abandoning the whole
                    # instance; a null content field is treated as empty.
                    if not isinstance(r, dict):
                        continue
                    content = self._clean(r.get("content"), 500)
                    if content:
                        snippets.append(content)
                if snippets:
                    return [timestamp] + snippets
            except Exception as e:
                logger.debug("SearX instance %s failed: %s", instance, e)
                continue

        raise Exception("No working SearX instances")

    def _fallback_search(self, query: str) -> List[str]:
        """
        Fallback search using simple web scraping.

        Scrapes up to three ``result__snippet`` anchors from the DuckDuckGo
        HTML endpoint. This is a very basic fallback - in production you'd
        want more sophisticated methods.

        Raises:
            Exception: If the required libraries are missing, the request or
                parsing fails, or no snippet was found. When an underlying
                error exists it is attached as ``__cause__``.
        """
        if not (HAS_REQUESTS and HAS_BS4):
            raise Exception("Required libraries not available for fallback search")

        timestamp = self._timestamp()

        try:
            response = requests.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query},
                timeout=10,
                headers={"User-Agent": "Mozilla/5.0 (compatible; OrKa-Search/1.0)"},
            )
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                anchors = soup.find_all("a", class_="result__snippet")
                snippets = [
                    text
                    for text in (self._clean(a.get_text(), 300) for a in anchors[:3])
                    if text
                ]
                if snippets:
                    return [timestamp] + snippets
        except Exception as e:
            logger.debug("Fallback search failed: %s", e)
            # Keep the original error reachable for whoever inspects the failure.
            raise Exception("Fallback search failed") from e

        raise Exception("Fallback search failed")
class SimpleSearchTool(BaseTool):
    """
    A simple search tool that provides basic information without external APIs.
    Useful as a last resort when all other search methods fail.
    """

    # (keywords, response lines) pairs, checked in order; the first pair with a
    # keyword occurring anywhere in the lower-cased query supplies the answer.
    _CANNED_RESPONSES = (
        (
            ("weather", "temperature", "climate"),
            (
                "For current weather information, please check a weather service like weather.com or your local weather app.",
                "Weather data requires real-time APIs that are not available in this search tool.",
            ),
        ),
        (
            ("news", "latest", "recent", "today"),
            (
                "For latest news, please visit news websites like BBC, Reuters, or AP News.",
                "Real-time news requires access to news APIs or RSS feeds.",
            ),
        ),
        (
            ("stock", "price", "market", "trading"),
            (
                "For financial information, please check financial websites like Yahoo Finance or Bloomberg.",
                "Stock prices and market data require real-time financial APIs.",
            ),
        ),
    )

    def run(self, input_data: Any) -> List[str]:
        """
        Provide basic search information without external APIs.

        For dict input the query is the first non-empty value among
        ``formatted_prompt``, ``input`` and ``query``; any other input is used
        directly. The query is always converted to ``str`` before matching, so
        non-string payload values (e.g. numbers) are handled as well.

        Args:
            input_data: Input containing search query.

        Returns:
            list: List with timestamp and basic information.
        """
        from datetime import datetime

        if isinstance(input_data, dict):
            raw_query = (
                input_data.get("formatted_prompt")
                or input_data.get("input")
                or input_data.get("query")
                or ""
            )
        else:
            raw_query = input_data
        # Normalise to str in both branches; the keyword match needs .lower().
        query = str(raw_query) if raw_query else ""

        timestamp = f"Current date and time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

        if not query:
            return [timestamp, "No search query provided"]

        # Provide basic information based on common query patterns
        query_lower = query.lower()
        for keywords, lines in self._CANNED_RESPONSES:
            if any(word in query_lower for word in keywords):
                return [timestamp, *lines]

        return [
            timestamp,
            f"Search query received: '{query}'",
            "External search services are currently unavailable. Please try again later or use a web browser for searching.",
        ]