Source code for search_engine_parser.core.engines.google

"""@desc
		Parser for google search results
"""
import sys
from urllib.parse import urljoin

from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem


[docs]class Search(BaseSearch):
    """
    Searches Google for string
    """
    name = "Google"
    base_url = "https://www.google.com/"
    summary = "\tNo need for further introductions. The search engine giant holds the first "\
        "place in search with a stunning difference of 65% from second in place Bing.\n"\
        "\tAccording to the latest netmarketshare report (November 2018) 73% of searches "\
        "were powered by Google and only 7.91% by Bing.\n\tGoogle is also dominating the "\
        "mobile/tablet search engine market share with 81%!"

    def __init__(self):
        super().__init__()
        self.search_url = urljoin(self.base_url, "search")

[docs]    def get_params(self, query=None, offset=None, page=None, **kwargs):
        params = {}
        params["start"] = (page-1) * 10
        params["q"] = query
        params["gbv"] = 1
        return params

[docs]    def parse_url(self, url):
        return urljoin(self.base_url, url)

[docs]    def parse_soup(self, soup):
        """
        Parses Google Search Soup for results
        """
        # find all class_='g' => each result
        return soup.find_all('div', class_="ZINbbc xpd O9g5cc uUPGi")

[docs]    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        """
        Parses the source code to return

        :param single_result: single result found in <div class="g">
        :type single_result: `bs4.element.ResultSet`
        :return: parsed title, link and description of single result
        :rtype: dict
        """
        # Some unneeded details shown such as suggestions should be ignore
        if (single_result.find("h2", class_="wITvVb") and single_result.find("div", class_="LKSyXe"))\
                or single_result.find("div", class_="X7NTVe"):
            return

        results = SearchItem()
        els = single_result.find_all('div', class_='kCrYT')
        if len(els) < 2:
            return

        # First div contains title and url
        r_elem = els[0]

        # Get the text and link
        if return_type in (ReturnType.FULL, return_type.TITLE):
            h3_tag = r_elem.find('h3')
            if h3_tag:
                title = h3_tag.text
            else:
                title = r_elem.find('div', class_='BNeawe').text
            results['titles'] = title

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = r_elem.find('a')
            if link_tag:
                raw_link = link_tag.get('href')
                results['links'] = self.parse_url(raw_link)

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            # Second Div contains Description
            desc_tag = els[1]
            if return_type in (ReturnType.FULL, ReturnType.LINK) and not results.get('links'):
                link_tag = desc_tag.find('a')
                if link_tag:
                    raw_link = link_tag.get('href')
                    results['links'] = self.parse_url(raw_link)
            desc = desc_tag.text
            results['descriptions'] = desc
        return results