Source code for search_engine_parser.core.base

"""@desc
		Base class inherited by every search engine
"""

import asyncio
import random
from abc import ABCMeta, abstractmethod
from contextlib import suppress
from enum import Enum, unique
from urllib.parse import urlencode, urlparse

import aiohttp
from bs4 import BeautifulSoup

from search_engine_parser.core import utils
from search_engine_parser.core.exceptions import NoResultsOrTrafficError


@unique
class ReturnType(Enum):
    FULL = "full"
    TITLE = "titles"
    DESCRIPTION = "descriptions"
    LINK = "links"
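
# --- Illustrative note (not part of the original module) ---------------------
# ReturnType narrows what parse_single_result extracts for each result. Since
# search() forwards extra keyword arguments down to parse_single_result (via
# get_results and parse_result), a hypothetical call such as
#     engine.search("python", page=1, return_type=ReturnType.LINK)
# would yield SearchItems populated only with "links".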

# Each result returned from a search is an individual item
class SearchItem(dict):
    """
    SearchItem is a dict for a single result, containing keys (titles, descriptions,
    links and other additional keys depending on the engine)

    >>> result
    <search_engine_parser.core.base.SearchItem object at 0x7f907426a280>
    >>> result["description"]
    Some description
    >>> result["descriptions"]
    Same description
    """

    def __getitem__(self, value):
        """ Allow getting a value by its singular or plural key ('description', 'descriptions', ...) """
        try:
            return super().__getitem__(value)
        except KeyError:
            pass
        if not value.endswith('s'):
            value += 's'
        return super().__getitem__(value)


class SearchResult:
    """
    The collection of results returned from a search

    >>> results = gsearch.search("preaching to the choir", 1)
    >>> results
    <search_engine_parser.core.base.SearchResult object at 0x7f907426a280>

    The object supports retrieving individual results by index or by type
    ('descriptions', 'links', ...)

    >>> results[0] # Returns the first result <SearchItem>
    >>> results["descriptions"] # Returns a list of all descriptions from all results

    It can also be iterated like a normal list to return individual SearchItem objects
    """

    def __init__(self):
        self.results = []

    def append(self, value):
        self.results.append(value)

    def __getitem__(self, value):
        """ Allow getting results by index or by type ('descriptions', 'links', ...) """
        if isinstance(value, int):
            return self.results[value]
        values = []
        for result in self.results:
            with suppress(KeyError):
                values.append(result[value])
        return values

    def keys(self):
        keys = {}
        with suppress(IndexError):
            keys = self.results[0].keys()
        return keys

    def __len__(self):
        return len(self.results)

    def __repr__(self):
        return "<SearchResult: {} results>".format(len(self.results))
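
# --- Illustrative sketch (not part of the original module) -------------------
# A minimal demonstration of how SearchItem and SearchResult cooperate: items
# store plural keys ("titles", "links", ...), SearchItem resolves singular
# lookups, and SearchResult supports indexing by position or by key.
def _example_search_result_usage():
    results = SearchResult()
    results.append(SearchItem(titles="First title", links="https://example.com"))
    results.append(SearchItem(titles="Second title", links="https://example.org"))
    assert results[0]["title"] == "First title"   # singular key falls back to "titles"
    assert results["links"] == ["https://example.com", "https://example.org"]
    assert len(results) == 2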


class BaseSearch(metaclass=ABCMeta):
    """
    Search base to be extended by search parsers.
    Every subclass must define the methods `parse_soup` and `parse_single_result`
    """

    # Summary of engine
    summary = None

    # Search Engine Name
    name = None

    # Search Engine unformatted URL
    search_url = None

    # The url after all query params have been set
    _parsed_url = None

    # boolean that indicates cache hit or miss
    _cache_hit = False

    @abstractmethod
    def parse_soup(self, soup):
        """ Defines the results contained in a soup """
        raise NotImplementedError("subclasses must define method <parse_soup>")

    @abstractmethod
    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        """
        Every div/span containing a result is passed here to retrieve
        `title`, `link` and `descr`
        """
        raise NotImplementedError(
            "subclasses must define method <parse_single_result>")

    def get_cache_handler(self):
        """ Return the Cache Handler to use """
        return utils.CacheHandler()

    @property
    def cache_handler(self):
        return self.get_cache_handler()

    def parse_result(self, results, **kwargs):
        """
        Runs every entry on the page through parse_single_result

        :param results: Result of main search to extract individual results
        :type results: list[`bs4.element.ResultSet`]
        :returns: SearchResult object containing the lists of titles, links,
            descriptions and other possible fields
        :rtype: SearchResult
        """
        search_results = SearchResult()
        for each in results:
            rdict = self.parse_single_result(each, **kwargs)
            if rdict:
                search_results.append(rdict)
        return search_results

    def get_params(self, query=None, page=None, offset=None, **kwargs):
        """ This method should be overridden to return a dictionary of query params """
        return {'q': query, 'page': page}

    def headers(self):
        headers = {
            "Cache-Control": 'no-cache',
            "Connection": "keep-alive",
            "User-Agent": utils.get_rand_user_agent()
        }
        return headers

    def clear_cache(self, all_cache=False):
        """
        Triggers the clear cache function for a particular engine

        :param all_cache: if True, deletes the cache for all engines
        """
        if all_cache:
            return self.cache_handler.clear()
        return self.cache_handler.clear(self.name)

    async def get_source(self, url, cache=True):
        """
        Returns the source code of a webpage. Also sets _cache_hit if the cache was used

        :param url: URL to pull the source code from
        :return: html source code of the given URL
        :rtype: string
        """
        try:
            html, cache_hit = await self.cache_handler.get_source(
                self.name, url, self.headers(), cache)
        except Exception as exc:
            raise Exception('ERROR: {}\n'.format(exc))
        self._cache_hit = cache_hit
        return html

    async def get_soup(self, url, cache):
        """
        Get the html soup of a query

        :rtype: `bs4.BeautifulSoup`
        """
        html = await self.get_source(url, cache)
        return BeautifulSoup(html, 'lxml')

    def get_search_url(self, query=None, page=None, **kwargs):
        """ Return a formatted search url """
        # Some URLs use offsets
        offset = (page * 10) - 9
        params = self.get_params(
            query=query, page=page, offset=offset, **kwargs)
        url = urlparse(self.search_url)
        # For localization purposes, a custom url can be passed for the same engine,
        # e.g. google.de instead of google.com
        if kwargs.get("url"):
            new_url = urlparse(kwargs.pop("url"))
            # When a url is passed without a scheme, e.g. google.de, it is parsed as a path
            if not new_url.netloc:
                url = url._replace(netloc=new_url.path)
            else:
                url = url._replace(netloc=new_url.netloc)
        self.base_url = url.geturl()
        self._parsed_url = url._replace(query=urlencode(params))
        return self._parsed_url.geturl()
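
    # --- Illustrative note (not part of the original module) -----------------
    # How get_search_url composes a URL, assuming a hypothetical engine whose
    # search_url is "https://www.example.com/search?":
    #     query="hello world", page=2  ->  offset = (2 * 10) - 9 = 11
    #     default get_params           ->  {'q': 'hello world', 'page': 2}
    #     result: "https://www.example.com/search?q=hello+world&page=2"
    # Passing url="example.de" swaps only the netloc:
    #     result: "https://example.de/search?q=hello+world&page=2"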

    def get_results(self, soup, **kwargs):
        """ Get results from soup """
        results = self.parse_soup(soup)
        # TODO Check whether empty results are caused by traffic flagging or by
        # no answers to the query being found
        if not results:
            print("ENGINE FAILURE: {}\n".format(self.name))
            raise NoResultsOrTrafficError(
                "The result parsing was unsuccessful. Either your query could not be found"
                " or the request was flagged as unusual traffic")
        try:
            search_results = self.parse_result(results, **kwargs)
        # AttributeError occurs when the returned soup cannot be parsed
        except AttributeError:
            raise NoResultsOrTrafficError(
                "The returned results could not be parsed. This might be due to site updates or "
                "server errors. Drop an issue at https://github.com/bisoncorps/search-engine-parser"
                " if this persists")
        return search_results

    def search(self, query=None, page=1, cache=True, **kwargs):
        """
        Query the search engine

        :param query: the query to search for
        :type query: str
        :param page: Page to be displayed, defaults to 1
        :type page: int
        :return: SearchResult containing the titles, links, descriptions and other
            fields of each parsed result
        """
        # Pages can only be from 1-N
        if page <= 0:
            page = 1

        # Get search page results
        loop = asyncio.get_event_loop()
        url = self.get_search_url(query, page, **kwargs)
        soup = loop.run_until_complete(self.get_soup(url, cache=cache))
        return self.get_results(soup, **kwargs)
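

# --- Illustrative sketch (not part of the original module) -------------------
# A minimal outline of a concrete engine built on BaseSearch. The name, URL and
# CSS selectors below are hypothetical; each real engine ships its own
# parse_soup/parse_single_result tailored to the markup it scrapes.
class ExampleSearch(BaseSearch):
    name = "Example"
    summary = "A hypothetical engine used only to illustrate the BaseSearch contract"
    search_url = "https://www.example.com/search?"

    def parse_soup(self, soup):
        # Hypothetical container selector for individual results
        return soup.find_all("div", class_="result")

    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        link_tag = single_result.find("a")
        if link_tag is None:
            return None
        rdict = SearchItem()
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = link_tag.get_text(strip=True)
        if return_type in (ReturnType.FULL, ReturnType.LINK):
            rdict["links"] = link_tag.get("href")
        return rdict

# Usage (requires network access):
#     results = ExampleSearch().search("python asyncio", page=1)
#     print(results["titles"])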