# Source code for orka.tools.search_tools

# OrKa: Orchestrator Kit Agents
# Copyright © 2025 Marco Somma
#
# This file is part of OrKa – https://github.com/marcosomma/orka-reasoning
#
# Licensed under the Apache License, Version 2.0 (Apache 2.0).
# You may not use this file for commercial purposes without explicit permission.
#
# Full license: https://www.apache.org/licenses/LICENSE-2.0
# For commercial use, contact: marcosomma.work@gmail.com
#
# Required attribution: OrKa by Marco Somma – https://github.com/marcosomma/orka-reasoning

"""
Search Tools Module
===================

This module implements web search tools for the OrKa framework.
These tools provide capabilities to search the web using various search engines.

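The search tools in this module include:
- DuckDuckGoTool: Searches the web using the DuckDuckGo search engine
- WebSearchTool: Tries DuckDuckGo, public SearX instances and an HTML-scraping fallback in turn
- SimpleSearchTool: Returns canned guidance without calling any external service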

These tools can be used within workflows to retrieve real-time information
from the web, enabling agents to access up-to-date knowledge that might not
be present in their training data.
"""

import logging
from typing import Any, List

# Optional imports for search engines
try:
    from duckduckgo_search import DDGS

    HAS_DUCKDUCKGO = True
    DDGS_INSTANCE: Any = DDGS
except ImportError:
    DDGS_INSTANCE = None
    HAS_DUCKDUCKGO = False

# Optional import for requests (for alternative search methods)
try:
    import requests

    HAS_REQUESTS = True
except ImportError:
    HAS_REQUESTS = False

# Optional import for BeautifulSoup (for web scraping fallback)
try:
    from bs4 import BeautifulSoup

    HAS_BS4 = True
except ImportError:
    HAS_BS4 = False

from .base_tool import BaseTool

logger = logging.getLogger(__name__)



[docs]
class DuckDuckGoTool(BaseTool):
    """
    A tool that performs web searches using the DuckDuckGo search engine.
    Returns search result snippets from the top results.
    """

    def run(self, input_data: Any) -> List[str]:
        """
        Perform a DuckDuckGo search and return result snippets.

        Args:
            input_data: Either a dict payload or a raw query value. For a dict
                the query is resolved in this order:
                ``input_data["formatted_prompt"]``, ``self.formatted_prompt``,
                ``self.prompt`` (when truthy), then ``input_data["input"]`` or
                ``input_data["query"]``. Any other value is used as the query
                itself.

        Returns:
            list: On success, a timestamp line followed by result snippets.
            A single-element list with an explanatory message is returned when
            the ``duckduckgo_search`` package is missing or no query was given.
        """
        # Check if DuckDuckGo is available
        if not HAS_DUCKDUCKGO:
            return ["DuckDuckGo search not available - duckduckgo_search package not installed"]

        # Get query - prioritize formatted_prompt from orchestrator, then fall back to other sources
        if isinstance(input_data, dict):
            # First check if orchestrator has provided a formatted_prompt via payload
            if "formatted_prompt" in input_data:
                query = input_data["formatted_prompt"]
            # Then check if we have a prompt that was rendered by orchestrator
            elif hasattr(self, "formatted_prompt"):
                query = self.formatted_prompt
            # Fall back to the raw prompt (which should be rendered by orchestrator)
            elif hasattr(self, "prompt") and self.prompt:
                query = self.prompt
            # Finally, try to get from input data
            else:
                query = input_data.get("input") or input_data.get("query") or ""
        else:
            query = input_data

        if not query:
            return ["No query provided"]

        # Normalise to text; a whitespace-only query is treated as missing
        # instead of being sent to the search engine.
        query = str(query).strip()
        if not query:
            return ["No query provided"]

        # Execute real search
        return self._execute_search(query)

    @staticmethod
    def _collect_snippets(results: Any, limit: int = 5, max_chars: int = 500) -> List[str]:
        """
        Turn raw search results into cleaned, truncated snippet strings.

        Args:
            results: Iterable of result items; only dict items carrying a
                non-None ``"body"`` value contribute a snippet.
            limit: Maximum number of snippets to return.
            max_chars: Snippets longer than this are cut and suffixed with "...".

        Returns:
            list: Up to ``limit`` non-empty snippet strings.
        """
        snippets: List[str] = []
        for item in results or []:
            if not isinstance(item, dict):
                continue
            raw_body = item.get("body")
            # A missing/None body would otherwise be stringified to "None".
            if raw_body is None:
                continue
            body = str(raw_body).strip()
            if not body:
                continue
            if len(body) > max_chars:
                body = body[:max_chars] + "..."
            snippets.append(body)
            if len(snippets) >= limit:
                break
        return snippets

    def _execute_search(self, query: str) -> List[str]:
        """
        Execute the DuckDuckGo search with retries and a news-search fallback.

        Each attempt opens a DDGS session, tries the text search and then the
        news search, and returns as soon as one of them yields usable snippets.
        Between attempts the method sleeps with exponential backoff, whether the
        attempt failed with an exception or simply produced no results.

        Args:
            query: The search query text.

        Returns:
            list: A timestamp line followed by up to five snippets, or a
            timestamp line followed by a single error/unavailable message.
        """
        from datetime import datetime
        import time

        timestamp = f"Current date and time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

        max_retries = 2
        retry_delay = 1

        try:
            for attempt in range(max_retries):
                try:
                    with DDGS_INSTANCE(timeout=10) as ddgs:
                        # Try text search first, then fall back to news search.
                        for label in ("text", "news"):
                            try:
                                # Looked up inside the try so a missing method
                                # on the client only fails this one search kind.
                                search = getattr(ddgs, label)
                                snippets = self._collect_snippets(search(query, max_results=5))
                            except Exception as search_error:
                                logger.warning(
                                    f"{label.capitalize()} search failed on attempt {attempt + 1}: {str(search_error)}"
                                )
                                continue

                            if snippets:
                                logger.info(
                                    f"DuckDuckGo {label} search returned {len(snippets)} results"
                                )
                                return [timestamp] + snippets

                        # If we get here, neither search produced usable snippets
                        logger.warning(
                            f"Both text and news searches returned empty results on attempt {attempt + 1}"
                        )

                except Exception as ddgs_error:
                    logger.warning(
                        f"DDGS initialization failed on attempt {attempt + 1}: {str(ddgs_error)}"
                    )

                # Back off before the next attempt regardless of how this one
                # failed, so an empty response is not retried immediately.
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff

            # All attempts failed
            logger.error("All DuckDuckGo search attempts failed")
            return [timestamp, "Search temporarily unavailable - please try again later"]

        except Exception as e:
            # Last-resort boundary: this tool reports failures as result text
            # rather than raising into the caller.
            logger.error(f"DuckDuckGo search failed with unexpected error: {str(e)}")
            return [timestamp, f"Search error: {str(e)}"]




[docs]
class WebSearchTool(BaseTool):
    """
    A more robust web search tool that tries multiple search methods.
    Falls back through different search engines and methods.
    """

    def run(self, input_data: Any) -> List[str]:
        """
        Perform web search using multiple fallback methods.

        The methods are tried in order: DuckDuckGo, public SearX instances,
        then HTML scraping. The first one that returns more than just a
        timestamp wins; a method that raises is logged and skipped.

        Args:
            input_data: Input containing search query (dict payload or raw value).

        Returns:
            list: A timestamp line followed by search result snippets, a
            timestamp plus an "unavailable" message when every method fails,
            or ``["No query provided"]`` when no query could be extracted.
        """
        # Get query using same logic as DuckDuckGoTool
        query = self._extract_query(input_data)
        if not query:
            return ["No query provided"]

        # Try search methods in order of preference
        search_methods = [self._duckduckgo_search, self._searx_search, self._fallback_search]

        for method in search_methods:
            try:
                results = method(query)
                if results and len(results) > 1:  # More than just timestamp
                    logger.info(f"Search successful using {method.__name__}")
                    return results
            except Exception as e:
                # Each method signals "nothing found / unavailable" by raising;
                # move on to the next one.
                logger.warning(f"Search method {method.__name__} failed: {str(e)}")
                continue

        # All methods failed
        return [
            self._timestamp(),
            "All search methods unavailable - please check internet connection",
        ]

    def _extract_query(self, input_data: Any) -> str:
        """
        Extract the query text from the input.

        For a dict the lookup order is ``input_data["formatted_prompt"]``,
        ``self.formatted_prompt``, ``self.prompt`` (when truthy), then
        ``input_data["input"]`` or ``input_data["query"]``. Any other value is
        used as the query itself.

        Returns:
            str: The stripped query text, or "" when nothing usable was found.
        """
        if isinstance(input_data, dict):
            if "formatted_prompt" in input_data:
                query = input_data["formatted_prompt"]
            elif hasattr(self, "formatted_prompt"):
                query = self.formatted_prompt
            elif hasattr(self, "prompt") and self.prompt:
                query = self.prompt
            else:
                query = input_data.get("input") or input_data.get("query") or ""
        else:
            query = input_data

        # Whitespace-only text counts as "no query".
        return str(query).strip() if query else ""

    @staticmethod
    def _timestamp() -> str:
        """Return the timestamp line that prefixes every result list."""
        from datetime import datetime

        return f"Current date and time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

    @staticmethod
    def _clean_snippet(raw: Any, max_chars: int) -> str:
        """
        Normalise one raw snippet value.

        Returns "" for None, otherwise the stripped text, cut to ``max_chars``
        characters with a "..." suffix when it was longer.
        """
        if raw is None:
            return ""
        text = str(raw).strip()
        if len(text) > max_chars:
            text = text[:max_chars] + "..."
        return text

    def _duckduckgo_search(self, query: str) -> List[str]:
        """
        Try DuckDuckGo text search.

        Returns:
            list: Timestamp line followed by up to five snippets.

        Raises:
            RuntimeError: If the DuckDuckGo client is not installed or the
                search produced no usable snippets. Errors raised by the
                client itself propagate unchanged.
        """
        if not HAS_DUCKDUCKGO:
            raise RuntimeError("DuckDuckGo not available")

        with DDGS_INSTANCE(timeout=10) as ddgs:
            # Try text search
            results = list(ddgs.text(query, max_results=5))

        snippets: List[str] = []
        for item in results:
            if not isinstance(item, dict):
                continue
            # .get() + None check so a null body is skipped instead of
            # becoming the literal snippet "None".
            snippet = self._clean_snippet(item.get("body"), 500)
            if snippet:
                snippets.append(snippet)

        if snippets:
            return [self._timestamp()] + snippets[:5]

        raise RuntimeError("No results from DuckDuckGo")

    def _searx_search(self, query: str) -> List[str]:
        """
        Try SearX public instances, one after another.

        Returns:
            list: Timestamp line followed by up to five snippets from the
            first instance that yields any.

        Raises:
            RuntimeError: If ``requests`` is not installed or no instance
                returned usable results.
        """
        if not HAS_REQUESTS:
            raise RuntimeError("Requests library not available")

        # Public SearX instances (these change frequently)
        searx_instances = ["https://searx.be", "https://search.sapti.me", "https://searx.info"]

        for instance in searx_instances:
            try:
                response = requests.get(
                    f"{instance}/search",
                    params={"q": query, "format": "json", "categories": "general"},
                    timeout=10,
                    headers={"User-Agent": "OrKa-Search/1.0"},
                )

                if response.status_code != 200:
                    continue

                data = response.json()
                results = data.get("results") or []

                snippets: List[str] = []
                for item in results:
                    # Skip malformed entries rather than letting one bad item
                    # (non-dict, or "content": null) discard the whole response.
                    if not isinstance(item, dict):
                        continue
                    snippet = self._clean_snippet(item.get("content"), 500)
                    if snippet:
                        snippets.append(snippet)
                    # Cap after filtering so empty entries do not use up slots.
                    if len(snippets) >= 5:
                        break

                if snippets:
                    return [self._timestamp()] + snippets

            except Exception as e:
                # Best-effort: network, HTTP and JSON errors all just mean
                # "try the next instance".
                logger.debug(f"SearX instance {instance} failed: {str(e)}")
                continue

        raise RuntimeError("No working SearX instances")

    def _fallback_search(self, query: str) -> List[str]:
        """
        Fallback search using simple web scraping of DuckDuckGo's HTML page.

        Returns:
            list: Timestamp line followed by up to three snippets.

        Raises:
            RuntimeError: If ``requests``/``bs4`` are not installed or the
                scrape produced no snippets (the underlying error, if any, is
                logged at debug level).
        """
        if not (HAS_REQUESTS and HAS_BS4):
            raise RuntimeError("Required libraries not available for fallback search")

        # This is a very basic fallback - in production you'd want more sophisticated methods
        try:
            response = requests.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query},
                timeout=10,
                headers={"User-Agent": "Mozilla/5.0 (compatible; OrKa-Search/1.0)"},
            )

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, "html.parser")
                anchors = soup.find_all("a", class_="result__snippet")

                snippets: List[str] = []
                for anchor in anchors[:3]:
                    snippet = self._clean_snippet(anchor.get_text(), 300)
                    if snippet:
                        snippets.append(snippet)

                if snippets:
                    return [self._timestamp()] + snippets

        except Exception as e:
            logger.debug(f"Fallback search failed: {str(e)}")

        raise RuntimeError("Fallback search failed")




[docs]
class SimpleSearchTool(BaseTool):
    """
    A simple search tool that provides basic information without external APIs.
    Useful as a last resort when all other search methods fail.
    """

    # Ordered (keywords, response lines) pairs; the first entry with a keyword
    # occurring anywhere in the lower-cased query decides the response.
    _TOPIC_RESPONSES = (
        (
            ("weather", "temperature", "climate"),
            (
                "For current weather information, please check a weather service like weather.com or your local weather app.",
                "Weather data requires real-time APIs that are not available in this search tool.",
            ),
        ),
        (
            ("news", "latest", "recent", "today"),
            (
                "For latest news, please visit news websites like BBC, Reuters, or AP News.",
                "Real-time news requires access to news APIs or RSS feeds.",
            ),
        ),
        (
            ("stock", "price", "market", "trading"),
            (
                "For financial information, please check financial websites like Yahoo Finance or Bloomberg.",
                "Stock prices and market data require real-time financial APIs.",
            ),
        ),
    )

    def run(self, input_data: Any) -> List[str]:
        """
        Provide basic search information without external APIs.

        Args:
            input_data: Input containing search query. For a dict, the first
                truthy value among ``"formatted_prompt"``, ``"input"`` and
                ``"query"`` is used; any other value is used directly.

        Returns:
            list: A timestamp line followed by either a "no query" message,
            two topic-specific guidance lines, or an echo of the query plus a
            generic "unavailable" notice.
        """
        # Get query
        if isinstance(input_data, dict):
            raw_query = (
                input_data.get("formatted_prompt")
                or input_data.get("input")
                or input_data.get("query")
                or ""
            )
        else:
            raw_query = input_data if input_data else ""

        # Coerce to text on both paths: a dict payload may carry a non-string
        # value (e.g. a number), which has no .lower().
        query = str(raw_query)

        from datetime import datetime

        timestamp = f"Current date and time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"

        if not query:
            return [timestamp, "No search query provided"]

        # Provide basic information based on common query patterns
        query_lower = query.lower()

        for keywords, lines in self._TOPIC_RESPONSES:
            if any(word in query_lower for word in keywords):
                return [timestamp, *lines]

        return [
            timestamp,
            f"Search query received: '{query}'",
            "External search services are currently unavailable. Please try again later or use a web browser for searching.",
        ]