# Source code for search_engine_parser.core.engines.youtube

"""@desc
		Parser for YouTube search results
"""
from search_engine_parser.core.base import BaseSearch, ReturnType, SearchItem


class Search(BaseSearch):
    """
    Searches YouTube for string

    Result nodes are located with the legacy ``yt-lockup-*`` /
    ``yt-uix-*`` CSS classes; every selector used is spelled out in
    :meth:`parse_soup` and :meth:`parse_single_result`.
    """
    name = "YouTube"
    base_url = "https://youtube.com"
    search_url = base_url + "/results?"
    summary = (
        "\tYouTube is an American video-sharing website headquartered in San Bruno, "
        "California. Three former PayPal employees—Chad Hurley, Steve Chen, and Jawed "
        "Karim—created the service in February 2005.\n\tGoogle bought the site in November "
        "2006 for US$1.65 billion; YouTube now operates as one of Google's subsidiaries. "
        "As of May 2019, more than 500 hours of video content are uploaded to YouTube every minute"
    )

    def get_params(self, query=None, page=None, offset=None, **kwargs):
        """
        Builds the query-string parameters for a YouTube search.

        :param query: search string, sent as ``search_query``
        :param page: accepted for signature compatibility; not used here
        :param offset: accepted for signature compatibility; not used here
        :return: mapping holding the single key ``search_query``
        :rtype: dict
        """
        return {"search_query": query}

    def parse_soup(self, soup):
        """
        Parses YouTube for a search query.

        :param soup: parsed results page exposing ``find_all``
        :return: every ``<div class="yt-lockup-content">`` node in the page
        """
        # each search result is wrapped in a <div class="yt-lockup-content">
        return soup.find_all('div', class_='yt-lockup-content')

    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        """
        Parses the source code to return

        The node is first read as a single video. If any lookup needed for
        that layout fails, the playlist layout is tried instead: the link
        comes from an anchor whose href starts with ``/playlist`` and the
        channel from one whose href starts with ``/user``. In that fallback
        a field whose tag cannot be found is left out of the result.

        :param single_result: one node returned by :meth:`parse_soup`
            (a ``<div class="yt-lockup-content">``)
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract; ``ReturnType.FULL``
            additionally collects channel, duration, views and upload date
        :return: parsed title, link and description of single result
        :rtype: SearchItem
        :raises AttributeError: when a title is requested but the node has
            no ``<a class="yt-uix-tile-link">``
        """
        # pylint: disable=too-many-locals
        rdict = SearchItem()
        title_tag = single_result.find('a', class_='yt-uix-tile-link')
        channel_name = ""
        # Bound up front so the fallback branch can never hit an unbound name.
        ref_link = None

        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            # Get the text and link
            rdict["titles"] = title_tag.text

        # try for single videos
        try:
            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = title_tag.get('href')
                rdict["links"] = self.base_url + ref_link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                rdict["descriptions"] = single_result.find(
                    'div', class_="yt-lockup-description").text

            if return_type == ReturnType.FULL:
                duration = single_result.find(
                    'span', class_='accessible-description').text
                ul_tag = single_result.find('ul', class_='yt-lockup-meta-info')

                channel_name = single_result.find(
                    'a', class_='yt-uix-sessionlink spf-link').text
                meta_items = ul_tag.find_all('li')
                # first <li> is read as the upload date, second as the views
                upload_date = meta_items[0].text
                views = meta_items[1].text
                rdict.update({
                    "channels": channel_name,
                    "durations": duration,
                    "views": views,
                    "upload_dates": upload_date,
                })
        # Exception rather than BaseException: a missing tag must trigger the
        # fallback, but KeyboardInterrupt/SystemExit must not be swallowed.
        except Exception:  # pylint: disable=broad-except
            link_tags = single_result.find_all(
                'a', class_='yt-uix-sessionlink spf-link')
            # TODO Optimize calls here so that we don't assign ref_link and channel_name
            # when we don't need them
            for anchor in link_tags:
                # anchors without an href must not abort the fallback
                href = anchor.get("href") or ""
                if href.startswith("/playlist"):
                    ref_link = href
                elif href.startswith("/user"):
                    channel_name = anchor.text

            # ref_link may still hold the href read in the video attempt;
            # only when nothing was found at all is the link omitted.
            if return_type in (ReturnType.FULL, ReturnType.LINK) and ref_link is not None:
                rdict["links"] = self.base_url + ref_link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc_tag = single_result.find(
                    'span', class_='accessible-description')
                if desc_tag is not None:
                    rdict["descriptions"] = desc_tag.text

            if return_type == ReturnType.FULL:
                rdict.update({
                    "channels": channel_name,
                })
        return rdict