Source code for src.scraping

""" Add new scrapers here. Please follow these steps to do so:

- Create a class whose name ends with `Scraper`, e.g. `YourScraper`\
    (although it should be explicit which website it crawls).
- Make that class inherit from `Scraper`
- Call for `super()` in its constructor, and pass it the URL of the webpage\
    to crawl and the `playlist_id` to upload the songs to. e.g:

    .. code-block:: python

        player_url = 'https://radio.com/awesome-song-history'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'
        super(YourScraper, self).__init__(player_url, playlist_id)

- Override the `get_song_history` method; its first statement should be:

    .. code-block:: python

        soup, driver = self.scrap_webpage()

- Add your scraper in the [tests](./tests/test_scraping.py) folder:

    .. code-block:: python

        class TestYourScraper(GenericScraperTest):
            scraper = scraping.YourScraper()

- Add your scraper in the
  [src.playlist_updater.Updater](./src/playlist_updater.py) class:

    .. code-block:: python

        self.scrapers = [
            scraping.KSHEScraper(),
            scraping.EagleScraper(),
            scraping.YourScraper()  # New scraper!
        ]

- You're all set!
"""

import logging
from abc import ABC, abstractmethod

from bs4 import BeautifulSoup
from selenium import webdriver

from .exceptions import NoHistoryFound


class Scraper(ABC):
    """Abstract base class for song-history scrapers.

    A concrete scraper passes the URL of the page to crawl and the id of
    the playlist to upload the songs to, then implements
    ``get_song_history``.

    Attributes set by the constructor:

    - ``name``: the concrete class name
    - ``player_url``: URL of the webpage to crawl
    - ``playlist_id``: id of the playlist the songs are uploaded to
    """

    def __init__(self, player_url, playlist_id):
        """Store the scraper settings and log the URL in use.

        Args:
            player_url: URL of the webpage to crawl.
            playlist_id: id of the playlist to upload the songs to.
        """
        self.name = self.__class__.__name__
        self.player_url = player_url
        self.playlist_id = playlist_id

        logging.info('Scraper initialized. Using %s', self.player_url)

    def scrap_webpage(self):
        """Scrap the webpage. This function must be called first in the
        ``get_song_history`` implementation.

        The caller owns the returned driver and is responsible for calling
        ``driver.quit()`` once it is done with the page.

        Returns:
            tuple: soup and driver
        """
        options = webdriver.FirefoxOptions()
        # Pass the command-line flag directly instead of assigning
        # ``options.headless``: the attribute setter is not available in
        # every Selenium release, while the argument form is.
        options.add_argument("-headless")
        driver = webdriver.Firefox(options=options)

        try:
            driver.get(self.player_url)
            soup = BeautifulSoup(driver.page_source, "lxml")
        except Exception:
            # On failure the driver is never returned, so nobody else
            # could shut the browser down: do it here, then re-raise.
            driver.quit()
            raise

        return soup, driver

    @abstractmethod
    def get_song_history(self):
        """Scrap the website and get its song history.
        This function must be overridden. Its implementation must return
        a list of dict with the following keys:

        - title
        - artist
        - timestamp (can be null, it's not used so far)
        """


class KSHEScraper(Scraper):
    """Scraper for the player page hosted at ``live.kshe95.com``."""

    def __init__(self):
        player_url = 'https://live.kshe95.com/listen/?'\
                     'utm_source=station-website&utm_medium=widget'\
                     '&utm_campaign=now-playing'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'

        super(KSHEScraper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per ``li.hll-recent-track`` element, with the
            keys ``title`` and ``artist`` (stripped, lower-cased) and
            ``timestamp`` (the ``datetime`` attribute of the ``time`` tag).

        Raises:
            NoHistoryFound: if the page has no recently played entry.
        """
        soup, driver = self.scrap_webpage()

        # try/finally guarantees the browser is shut down even when the
        # page has no history or an expected element is missing.
        try:
            recently_played = soup.find_all(
                "li",
                {"class": "hll-recent-track"}
            )

            if not recently_played:
                raise NoHistoryFound()

            link_attrs = {"class": "hll-link-color-hover ember-view"}
            history = []

            for recently_played_item in recently_played:
                div = recently_played_item.find("div", {"class": "caption"})
                div = div.find("div", {"class": "vertical-align"})

                # get song infos: title link sits in <h3>, artist in <cite>
                title = div.find("h3").find(
                    "a", link_attrs
                ).text.strip().lower()
                artist = div.find("cite").find(
                    "a", link_attrs
                ).text.strip().lower()

                # get song played time
                song_timestamp = recently_played_item.find(
                    "div",
                    {"class": "time"}
                ).time["datetime"]

                result = {
                    "title": title,
                    "artist": artist,
                    "timestamp": song_timestamp,
                }
                logging.info(result)
                history.append(result)

            return history
        finally:
            driver.quit()


class EagleScraper(Scraper):
    """Scraper for the playlist page hosted at ``eagle969.radio.com``."""

    def __init__(self):
        player_url = 'https://eagle969.radio.com/playlist'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'

        super(EagleScraper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per ``div.ts-track-item`` element, with the keys
            ``title`` and ``artist`` (stripped, lower-cased) and
            ``timestamp`` (always ``None`` for this scraper).

        Raises:
            NoHistoryFound: if the page has no track item.
        """
        soup, driver = self.scrap_webpage()

        # try/finally guarantees the browser is shut down even when the
        # page has no history or an expected element is missing.
        try:
            recently_played = soup.find_all("div", {"class": "ts-track-item"})

            if not recently_played:
                raise NoHistoryFound()

            history = []

            for recently_played_item in recently_played:
                title = recently_played_item.find(
                    "div",
                    {"class": "ts-song-title tagstation__song"}
                ).text.strip().lower()

                artist = recently_played_item.find(
                    "div",
                    {"class": "ts-artist tagstation__artist"}
                ).text.strip().lower()

                result = {"title": title, "artist": artist, "timestamp": None}
                logging.info(result)
                history.append(result)

            return history
        finally:
            driver.quit()


class Q1043Scrapper(Scraper):
    """Scraper for the recently-played page at ``q1043.iheart.com``."""

    def __init__(self):
        player_url = 'https://q1043.iheart.com/music/recently-played/'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'

        super(Q1043Scrapper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per matching ``li`` element, with the keys
            ``title`` and ``artist`` (stripped, lower-cased) and
            ``timestamp`` (always ``None`` for this scraper).

        Raises:
            NoHistoryFound: if the page has no matching track entry.
        """
        soup, driver = self.scrap_webpage()

        # try/finally guarantees the browser is shut down even when the
        # page has no history or an expected element is missing.
        try:
            recently_played = soup.find_all(
                "li",
                {"class": "playlist-track-container ondemand-track"}
            )

            if not recently_played:
                raise NoHistoryFound()

            history = []

            for recently_played_item in recently_played:
                title = recently_played_item.find(
                    "a",
                    {"class": "song-title"}
                ).text.strip().lower()

                artist = recently_played_item.find(
                    "a",
                    {"class": "artist-name"}
                ).text.strip().lower()

                result = {"title": title, "artist": artist, "timestamp": None}
                logging.info(result)
                history.append(result)

            return history
        finally:
            driver.quit()


class WMGKScrapper(Scraper):
    """Scraper for the stream page hosted at ``wmgk.com``."""

    def __init__(self):
        player_url = 'https://wmgk.com/stream/WMGKFM/'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'

        super(WMGKScrapper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per ``li`` inside ``div.song-archive``, with the
            keys ``title`` and ``artist`` (stripped, lower-cased) and
            ``timestamp`` (always ``None`` for this scraper).

        Raises:
            NoHistoryFound: if the archive container is missing or holds
                no ``li`` entry.
        """
        soup, driver = self.scrap_webpage()

        # try/finally guarantees the browser is shut down even when the
        # page has no history or an expected element is missing.
        try:
            archive = soup.find("div", {"class": "song-archive"})

            # ``find`` returns None when the container is absent; report
            # that as a missing history rather than an AttributeError.
            if archive is None:
                raise NoHistoryFound()

            recently_played = archive.find_all("li")

            if not recently_played:
                raise NoHistoryFound()

            history = []

            for recently_played_item in recently_played:
                title = recently_played_item.find(
                    "span",
                    {"class": "song-title"}
                ).text.strip().lower()

                artist = recently_played_item.find(
                    "span",
                    {"class": "song-artist"}
                ).text.strip().lower()

                result = {"title": title, "artist": artist, "timestamp": None}
                logging.info(result)
                history.append(result)

            return history
        finally:
            driver.quit()


class KLOScrapper(Scraper):
    """Scraper for the on-air page hosted at ``klos.tunegenie.com``."""

    def __init__(self):
        player_url = 'http://klos.tunegenie.com/onair/'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'

        super(KLOScrapper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per ``li.slot.lt`` entry of the on-air list,
            with the keys ``title`` and ``artist`` (stripped, lower-cased)
            and ``timestamp`` (always ``None`` for this scraper).

        Raises:
            NoHistoryFound: if the on-air list is missing or holds no
                matching entry.
        """
        soup, driver = self.scrap_webpage()

        # try/finally guarantees the browser is shut down even when the
        # page has no history or an expected element is missing.
        try:
            onair_list = soup.find(
                "ul",
                {"class": "slots currentonair onairlist"}
            )

            # ``find`` returns None when the list is absent; report that
            # as a missing history rather than an AttributeError.
            if onair_list is None:
                raise NoHistoryFound()

            recently_played = onair_list.find_all("li", {"class": "slot lt"})

            if not recently_played:
                raise NoHistoryFound()

            history = []

            for recently_played_item in recently_played:
                left = recently_played_item.find("div", {"class": "left"})
                title = left.find(
                    "div",
                    {"class": "song"}
                ).text.strip().lower()

                # NOTE(review): the empty class filter presumably targets
                # the artist div, which seems to carry no class name on
                # this page -- confirm against the live markup.
                artist = left.find(
                    "div",
                    {"class": ""}
                ).text.strip().lower()

                result = {"title": title, "artist": artist, "timestamp": None}
                logging.info(result)
                history.append(result)

            return history
        finally:
            driver.quit()