""" Add new scrapers here. Please follow these steps to do so:
- Create a class whose name ends with `Scraper`, e.g. `YourScraper`\
(although it should be explicit which website it crawls).
- Make that class inherit from `Scraper`
- Call for `super()` in its constructor, and pass it the URL of the webpage\
to crawl and the `playlist_id` to upload the songs to. e.g:
.. code-block:: python
player_url = 'https://radio.com/awesome-song-history'
playlist_id = '3BCcE8T945z1MnfPWkFsfX'
super(YourScraper, self).__init__(player_url, playlist_id)
- Override the `get_song_history` method; its first statement should be:
.. code-block:: python
soup, driver = self.scrap_webpage()
- Add your scraper in the [tests](./tests/test_scraping.py) folder:
.. code-block:: python
class TestYourScraper(GenericScraperTest):
scraper = scraping.YourScraper()
- Add your scraper in the
[src.playlist_updater.Updater](./src/playlist_updater.py) class:
.. code-block:: python
self.scrapers = [
scraping.KSHEScraper(),
scraping.EagleScraper(),
scraping.YourScraper() # New scraper!
]
- You're all set!
"""
import logging
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup
from selenium import webdriver
from .exceptions import NoHistoryFound
class Scraper(ABC):
    """Abstract base class for song-history scrapers.

    A concrete scraper passes the URL of the page to crawl and the id of
    the playlist to upload songs to, then implements
    ``get_song_history``.
    """

    def __init__(self, player_url, playlist_id):
        """Store the scraper configuration.

        Args:
            player_url: URL of the webpage to crawl.
            playlist_id: id of the playlist the songs are uploaded to.
        """
        # The concrete class name doubles as the scraper's display name.
        self.name = self.__class__.__name__
        self.player_url = player_url
        self.playlist_id = playlist_id
        logging.info('Scraper initialized. Using %s', self.player_url)

    def scrap_webpage(self):
        """Scrap the webpage. This function must be called first in the
        ``get_song_history`` implementation.

        The returned driver is still running: the caller is responsible
        for calling ``driver.quit()`` once it is done with it.

        Returns:
            tuple: soup and driver
        """
        options = webdriver.FirefoxOptions()
        # Pass the flag explicitly instead of using the ``headless``
        # attribute setter, which newer Selenium releases deprecate.
        options.add_argument("-headless")
        driver = webdriver.Firefox(options=options)
        try:
            driver.get(self.player_url)
            soup = BeautifulSoup(driver.page_source, "lxml")
        except BaseException:
            # The caller never receives the driver on failure, so it must
            # be shut down here or the browser process would be leaked.
            driver.quit()
            raise
        return soup, driver

    @abstractmethod
    def get_song_history(self):
        """Scrap the website and get its song history.

        This function must be overridden. Its implementation must return
        a list of dict with the following keys:

        - title
        - artist
        - timestamp (can be null, it's not used so far)
        """
class KSHEScraper(Scraper):
    """Scraper for the player page hosted at ``live.kshe95.com``."""

    def __init__(self):
        player_url = 'https://live.kshe95.com/listen/?'\
            'utm_source=station-website&utm_medium=widget'\
            '&utm_campaign=now-playing'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'
        super(KSHEScraper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per song with the keys ``title``, ``artist``
            (both stripped and lower-cased) and ``timestamp`` (the
            ``datetime`` attribute of the entry's ``<time>`` tag).

        Raises:
            NoHistoryFound: if the page has no ``hll-recent-track`` item.
        """
        soup, driver = self.scrap_webpage()
        try:
            recently_played = soup.find_all(
                "li", {"class": "hll-recent-track"}
            )
            if not recently_played:
                raise NoHistoryFound()

            # Title and artist links share the same CSS classes.
            link_attrs = {"class": "hll-link-color-hover ember-view"}
            history = []
            for recently_played_item in recently_played:
                div = recently_played_item.find("div", {"class": "caption"})
                div = div.find("div", {"class": "vertical-align"})

                # get song infos
                title = div.find("h3").find(
                    "a", link_attrs
                ).text.strip().lower()
                artist = div.find("cite").find(
                    "a", link_attrs
                ).text.strip().lower()

                # get song played time
                song_timestamp = recently_played_item.find(
                    "div",
                    {"class": "time"}
                ).time["datetime"]

                result = {
                    "title": title,
                    "artist": artist,
                    "timestamp": song_timestamp,
                }
                logging.info(result)
                history.append(result)
            return history
        finally:
            # Always shut the browser down, including when no history is
            # found or the page layout does not match the selectors.
            driver.quit()
class EagleScraper(Scraper):
    """Scraper for the playlist page hosted at ``eagle969.radio.com``."""

    def __init__(self):
        player_url = 'https://eagle969.radio.com/playlist'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'
        super(EagleScraper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per song with the keys ``title`` and
            ``artist`` (both stripped and lower-cased) and ``timestamp``
            (always ``None``).

        Raises:
            NoHistoryFound: if the page has no ``ts-track-item`` element.
        """
        soup, driver = self.scrap_webpage()
        try:
            recently_played = soup.find_all("div", {"class": "ts-track-item"})
            if not recently_played:
                raise NoHistoryFound()

            history = []
            for recently_played_item in recently_played:
                title = recently_played_item.find(
                    "div",
                    {"class": "ts-song-title tagstation__song"}
                ).text.strip().lower()
                artist = recently_played_item.find(
                    "div",
                    {"class": "ts-artist tagstation__artist"}
                ).text.strip().lower()
                result = {"title": title, "artist": artist, "timestamp": None}
                logging.info(result)
                history.append(result)
            return history
        finally:
            # Always shut the browser down, including when no history is
            # found or the page layout does not match the selectors.
            driver.quit()
class Q1043Scrapper(Scraper):
    """Scraper for the recently-played page at ``q1043.iheart.com``."""

    def __init__(self):
        player_url = 'https://q1043.iheart.com/music/recently-played/'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'
        super(Q1043Scrapper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per song with the keys ``title`` and
            ``artist`` (both stripped and lower-cased) and ``timestamp``
            (always ``None``).

        Raises:
            NoHistoryFound: if the page has no track list item.
        """
        soup, driver = self.scrap_webpage()
        try:
            recently_played = soup.find_all(
                "li",
                {"class": "playlist-track-container ondemand-track"}
            )
            if not recently_played:
                raise NoHistoryFound()

            history = []
            for recently_played_item in recently_played:
                title = recently_played_item.find(
                    "a",
                    {"class": "song-title"}
                ).text.strip().lower()
                artist = recently_played_item.find(
                    "a",
                    {"class": "artist-name"}
                ).text.strip().lower()
                result = {"title": title, "artist": artist, "timestamp": None}
                logging.info(result)
                history.append(result)
            return history
        finally:
            # Always shut the browser down, including when no history is
            # found or the page layout does not match the selectors.
            driver.quit()
class WMGKScrapper(Scraper):
    """Scraper for the stream page hosted at ``wmgk.com``."""

    def __init__(self):
        player_url = 'https://wmgk.com/stream/WMGKFM/'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'
        super(WMGKScrapper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per song with the keys ``title`` and
            ``artist`` (both stripped and lower-cased) and ``timestamp``
            (always ``None``).

        Raises:
            NoHistoryFound: if the ``song-archive`` container is missing
                or holds no list item.
        """
        soup, driver = self.scrap_webpage()
        try:
            archive = soup.find("div", {"class": "song-archive"})
            # ``find`` returns None when the container is absent; report
            # that as a missing history rather than an AttributeError.
            if archive is None:
                raise NoHistoryFound()
            recently_played = archive.find_all("li")
            if not recently_played:
                raise NoHistoryFound()

            history = []
            for recently_played_item in recently_played:
                title = recently_played_item.find(
                    "span",
                    {"class": "song-title"}
                ).text.strip().lower()
                artist = recently_played_item.find(
                    "span",
                    {"class": "song-artist"}
                ).text.strip().lower()
                result = {"title": title, "artist": artist, "timestamp": None}
                logging.info(result)
                history.append(result)
            return history
        finally:
            # Always shut the browser down, including when no history is
            # found or the page layout does not match the selectors.
            driver.quit()
class KLOScrapper(Scraper):
    """Scraper for the on-air page hosted at ``klos.tunegenie.com``."""

    def __init__(self):
        player_url = 'http://klos.tunegenie.com/onair/'
        playlist_id = '3BCcE8T945z1MnfPWkFsfX'
        super(KLOScrapper, self).__init__(player_url, playlist_id)

    def get_song_history(self):
        """Return the recently played songs found on the page.

        Returns:
            list: one dict per song with the keys ``title`` and
            ``artist`` (both stripped and lower-cased) and ``timestamp``
            (always ``None``).

        Raises:
            NoHistoryFound: if the on-air list is missing or holds no
                ``slot lt`` item.
        """
        soup, driver = self.scrap_webpage()
        try:
            onair_list = soup.find(
                "ul",
                {"class": "slots currentonair onairlist"}
            )
            # ``find`` returns None when the list is absent; report that
            # as a missing history rather than an AttributeError.
            if onair_list is None:
                raise NoHistoryFound()
            recently_played = onair_list.find_all("li", {"class": "slot lt"})
            if not recently_played:
                raise NoHistoryFound()

            history = []
            for recently_played_item in recently_played:
                left = recently_played_item.find("div", {"class": "left"})
                title = left.find(
                    "div",
                    {"class": "song"}
                ).text.strip().lower()
                # NOTE(review): the artist is looked up with an empty
                # class value -- confirm this selector matches the
                # intended element on the live page.
                artist = left.find(
                    "div",
                    {"class": ""}
                ).text.strip().lower()
                result = {"title": title, "artist": artist, "timestamp": None}
                logging.info(result)
                history.append(result)
            return history
        finally:
            # Always shut the browser down, including when no history is
            # found or the page layout does not match the selectors.
            driver.quit()