Source code for search_engine_parser.core.engines.baidu

"""@desc
		Parser for Baidu search results
"""

import re

from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem


[docs]class Search(BaseSearch):
    """
    Searches Baidu for string
    """
    name = "Baidu"
    search_url = "https://www.baidu.com/s?"
    summary = "\tBaidu, Inc. is a Chinese multinational technology company specializing in"\
        " Internet-related services and products and artificial intelligence (AI), headquartered"\
        " in Beijing's Haidian District.\n\tIt is one of the largest AI and internet"\
        " companies in the world.\n\tBaidu offers various services, including a"\
        " Chinese search engine, as well as a mapping service called Baidu Maps."

    """Override get_search_url"""

[docs]    def get_params(self, query=None, page=None, offset=None, **kwargs):
        params = {}
        params["wd"] = query
        params["pn"] = (page - 1) * 10
        params["oq"] = query
        return params

[docs]    def parse_soup(self, soup):
        """
        Parses Baidu for a search query
        """

        # Baidu search can be made deterministic via an id
        # Hence, a regex is used to match all eligible ids

        return soup.find_all('div', {'id': re.compile(r"^\d{1,2}")}, class_="c-container")

[docs]    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        """
        Parses the source code to return

        :param single_result: single result found in div with a numeric id
        :type single_result: `bs4.element.Tag`
        :return: parsed title, link and description of single result
        :rtype: dict
        """
        rdict = SearchItem()
        if return_type in (ReturnType.FULL, return_type.TITLE):
            h3_tag = single_result.find('h3')
            rdict["title"] = h3_tag.text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = single_result.find('a')
            # Get the text and link
            rdict["links"] = link_tag.get('href')

        if return_type in (ReturnType.FULL, return_type.DESCRIPTION):
            desc = single_result.find('div', class_='c-abstract') 
            rdict["descriptions"] = desc if desc else ''        
            return rdict