Source code for scrapers.lyrics_scraper

"""Module that defines the base class for scraping lyrics websites and saving
the scraped content.

More specifically, the derived classes (e.g.
:class:`~scrapers.azlyrics_scraper.AZLyricsScraper`) are the ones that do the
actual scraping of the lyrics webpages.

By default, the scraped data is saved in a dictionary (see the variable
:data:`~LyricsScraper.scraped_data`).

The scraped data can also be saved in a database if a path to the SQLite
database is given via the argument :ref:`db_filepath
<LyricsScraperParametersLabel>`.

See the structure of the music database as defined in the `music.sql schema`_.

.. _guide: https://bit.ly/2xYreie
.. _HTTP GET request: https://www.webopedia.com/TERM/H/HTTP_request_header.html
.. _music.sql schema: https://bit.ly/2kIMYvn
.. _saveutils.py: https://bit.ly/2m5z46A
.. _saveutils.SaveWebpages: https://bit.ly/2oaz7Px
.. _scraper.py: https://bit.ly/2msZDTC
.. _use a specialized library: https://stackoverflow.com/a/56476496
.. _YAML logging file: https://bit.ly/2m5wjSM

"""

import logging
import os
import random
import sqlite3
# NOTE:
# For urllib with Python 2, it is
# from six.moves.urllib.parse import urlparse
import urllib
from logging import NullHandler
from urllib.request import urlopen
from urllib.parse import urlparse

import lyrics_scraping.exceptions
import pyutils.exceptions
from lyrics_scraping.utils import plural, get_data_filepath
from pyutils.dbutils import connect_db, create_db, sql_sanity_checks
from pyutils.genutils import create_dir
from pyutils.logutils import get_error_msg, setup_logging_from_cfg
from pyutils.webcache import WebCache

logger = logging.getLogger(__name__)
logger.addHandler(NullHandler())


_SETUP_LOGGING = True


class LyricsScraper:
    """Base class for scraping and saving webpages locally.

    This class is responsible for doing lots of configuration before the web
    scraping starts, such as setting up logging and the database.

    The actual scraping of the lyrics websites is done by the derived classes
    (e.g. :class:`~scrapers.azlyrics_scraper.AZLyricsScraper`) since each
    lyrics website has its own way of being crawled (they are all designed
    differently). However, the base class is responsible for saving the
    scraped data in a dictionary (:data:`~LyricsScraper.scraped_data`) and in
    a database (if one was initially configured).

    .. _LyricsScraperParametersLabel:

    Parameters
    ----------
    lyrics_urls : list [str]
        List of URLs to lyrics webpages which will be scraped.
    db_filepath : str, optional
        File path to the SQLite music database (the default value is
        :obj:`None` which implies that no database will be used. The scraped
        data will be saved only in the :data:`~LyricsScraper.scraped_data`
        dictionary).
    autocommit : bool, optional
        Whether the changes to the database are committed right away (the
        default is False which implies that the changes won't take effect
        immediately).
    overwrite_db : bool, optional
        Whether the database will be overwritten. The user is given some time
        to stop the script before the database is overwritten (the default
        value is False).
    update_tables : bool, optional
        Whether the tables in the database can be updated (the default value
        is False).
    cache_dirpath : str, optional
        Path to the cache directory where webpages are saved (the default
        value is :obj:`None` which implies that the cache will not be used).
    overwrite_webpages : bool, optional
        Whether the webpages saved in cache can be overwritten (the default
        value is False).
    http_get_timeout : int, optional
        Timeout when a GET request doesn't receive any response from the
        server. After the timeout expires, the GET request is dropped (the
        default value is 5 seconds).
    delay_between_requests : int, optional
        A delay will be added between HTTP requests in order to reduce the
        workload on the server (the default value is 8 seconds which implies
        that there will be a delay of 8 seconds between successive HTTP
        requests).
    headers : dict, optional
        The information added to the `HTTP GET request`_ that a user's
        browser sends to a Web server containing the details of what the
        browser wants and will accept back from the server (the default
        value is defined in :obj:`saveutils.SaveWebpages.headers`).
    use_logging : bool, optional
        Whether to log messages on console and file. The logging is setup
        according to the `YAML logging file`_ (the default value is False
        which implies that no logging will be used and thus no messages will
        be printed on the console).
    **kwargs : dict
        TODO

    Attributes
    ----------
    skipped_urls : dict [str, str]
        Stores the URLs that were skipped because of an error such as
        :exc:`OSError` or :exc:`~exceptions.connection.HTTP404Error`, along
        with the error message. The keys are the URLs and the values are the
        associated error messages.
    good_urls : set
        Stores the unique URLs that were successfully processed and saved.
    checked_urls : set
        Stores the unique URLs that were processed (whether successfully or
        unsuccessfully) during the current session. Thus, `checked_urls`
        should equal `skipped_urls` + `good_urls`.
    db_conn : sqlite3.Connection
        SQLite database connection.
    saver : :class:`saveutils.SaveWebpages`
        For retrieving webpages and saving them in cache. See
        :mod:`saveutils`.
    valid_domains : list
        Only URLs from these domains will be processed.
    logging_cfg_filepath : str
        Path to the `YAML logging file`_ which is used to setup logging for
        all custom modules.
    schema_filepath : str
        Path to the `music.sql schema`_ for building the music database which
        will store the scraped data.
    scraped_data : dict
        The scraped data is saved as a dictionary. Its structure is based on
        the database's `music.sql schema`_.

    Notes
    -----
    If the corresponding flags are activated, logging and database are setup
    in :meth:`__init__`.

    By default, the scraped data is saved in a dictionary whose structure is
    described below (see :data:`~LyricsScraper.scraped_data`).

    The scraped data will also be saved if a database is given via
    :ref:`db_filepath <LyricsScraperParametersLabel>`. See the structure of
    the music database as defined in the `music.sql schema`_.

    The scraped webpages can also be cached in order to reduce the number of
    HTTP requests to the server (see :ref:`cache_dirpath
    <LyricsScraperParametersLabel>`).

    """

    valid_domains = ["www.azlyrics.com"]

    scraped_data = {
        'albums': {
            'headers': ('album_title', 'artist_name', 'year',),
            'data': []
        },
        'artists': {
            'headers': ('artist_name',),
            'data': []
        },
        'songs': {
            'headers': ('song_title', 'artist_name', 'album_title',
                        'lyrics_url', 'lyrics', 'year',),
            'data': []
        }
    }
    """The scraped data is saved as a dictionary.

    .. _scraped-data-Label:

    Its keys and values are defined as follows:

    .. code:: python

        scraped_data = {
            'albums': {
                'headers': ('album_title', 'artist_name', 'year',),
                'data': []
            },
            'artists': {
                'headers': ('artist_name',),
                'data': []
            },
            'songs': {
                'headers': ('song_title', 'artist_name', 'album_title',
                            'lyrics_url', 'lyrics', 'year',),
                'data': []
            }
        }

    .. note::

        The 'data' key points to a list of tuples that will eventually store
        the scraped data from different URLs, i.e. the scraped data from a
        given URL is added as a tuple to the list.

    """
    # TODO: add example of data.
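
    # A minimal illustration (hypothetical values, not taken from any real
    # scrape) of how ``scraped_data`` might look after a single lyrics URL
    # has been processed; each tuple follows the field order given in the
    # corresponding 'headers':
    #
    #   scraped_data['artists']['data'] == [('Some Artist',)]
    #   scraped_data['albums']['data'] == [('Some Album', 'Some Artist',
    #                                       '1999')]
    #   scraped_data['songs']['data'] == [('Some Song', 'Some Artist',
    #                                      'Some Album', 'https://...',
    #                                      'Lyrics text ...', '1999')]
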
    def __init__(self, db_filepath="", overwrite_db=False, use_webcache=True,
                 webcache_dirpath="~/.cache/lyric_scraping/",
                 expire_after=25920000, use_compute_cache=True, ram_size=100,
                 http_get_timeout=5, delay_between_requests=8,
                 headers=WebCache.HEADERS, seed=123456, interactive=False,
                 delay_interactive=30, best_match=False, simulate=False,
                 ignore_errors=False):
        self.skipped_urls = {}
        self.good_urls = set()
        self.checked_urls = set()
        # TODO: AssertionError is raised in both lines
        self.logging_cfg_filepath = get_data_filepath(file_type='log')
        self.schema_filepath = get_data_filepath(file_type='schema')
        # ==============
        # Logging config
        # ==============
        if _SETUP_LOGGING:
            # Setup logging for all custom modules based on the default
            # logging config file
            logger.debug("<color>Setting up logging ...</color>")
            setup_logging_from_cfg(self.logging_cfg_filepath)
            logger.info("<color>Logging is setup</color>")
        else:
            logger.warning("<color>Logging was already setup</color>")
        # ===============
        # Database config
        # ===============
        self.overwrite_db = overwrite_db
        self.db_filepath = os.path.expanduser(db_filepath)
        # TODO: remove db_conn from everywhere
        self.db_conn = None
        if self.db_filepath:
            logger.debug("<color>Setting up the music database ...</color>")
            # Create music db if necessary
            # TODO: IOError and sqlite3.OperationalError are raised
            create_db(self.db_filepath, self.schema_filepath,
                      self.overwrite_db)
            logger.info("<color>Music database is setup</color>")
        else:
            # No database to be used
            logger.debug("<color>No music database used</color>")
        # ================
        # Web cache config
        # ================
        self.webcache_dirpath = os.path.expanduser(webcache_dirpath)
        self.cache_name = os.path.join(self.webcache_dirpath, "cache")
        self.use_webcache = use_webcache
        self.expire_after = expire_after
        self.http_get_timeout = http_get_timeout
        self.delay_between_requests = delay_between_requests
        self.headers = headers
        if self.use_webcache:
            logger.debug("<color>Setting up web-cache ...</color>")
            logger.debug("<color>Creating the web-cache directory: "
                         "{}</color>".format(self.webcache_dirpath))
            try:
                # TODO: FileExistsError and PermissionError are raised
                create_dir(self.webcache_dirpath, overwrite=False)
            except FileExistsError as e:
                logger.debug("<color>{}</color>".format(e))
                logger.debug("<color>The webcache directory already exists: "
                             "{}</color>".format(self.webcache_dirpath))
            self.webcache = WebCache(
                cache_name=self.cache_name,
                expire_after=self.expire_after,
                http_get_timeout=self.http_get_timeout,
                delay_between_requests=self.delay_between_requests,
                headers=self.headers)
            logger.info("<color>web-cache is setup</color>")
        else:
            self.webcache = None
            logger.debug("<color>No web-cache used</color>")
        # ====================
        # Compute cache config
        # ====================
        self.use_compute_cache = use_compute_cache
        self.ram_size = ram_size
        if self.use_compute_cache:
            logger.debug("<color>Setting up compute-cache ...</color>")
            self.compute_cache = ComputeCache(self.schema_filepath,
                                              self.ram_size)
            logger.info("<color>compute-cache is setup</color>")
        else:
            self.compute_cache = None
            logger.debug("<color>No compute-cache used</color>")
        # ==============
        # Scraper config
        # ==============
        self.seed = seed
        random.seed(self.seed)
        logger.info("<color>Random number generator initialized with seed={}"
                    "</color>".format(self.seed))
        self.interactive = interactive
        self.delay_interactive = delay_interactive
        self.best_match = best_match
        self.simulate = simulate
        self.ignore_errors = ignore_errors
        self.min_year = 1000
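
    # Illustrative only: a concrete subclass (e.g. AZLyricsScraper) would
    # typically be instantiated with a handful of these keyword arguments,
    # relying on the defaults for the rest, e.g.
    #
    #   scraper = AZLyricsScraper(
    #       db_filepath="~/music.sqlite",
    #       webcache_dirpath="~/.cache/lyric_scraping/",
    #       delay_between_requests=8)
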

    def get_song_lyrics(self, song_title, artist_name=None):
        """TODO

        Parameters
        ----------
        song_title
        artist_name

        Returns
        -------

        """
        # TODO: add message
        raise NotImplementedError("")

    def get_lyrics_from_album(self, album_title, artist_name=None,
                              max_songs=None):
        """TODO

        Parameters
        ----------
        album_title
        artist_name
        max_songs

        Returns
        -------

        """
        # TODO: add message
        raise NotImplementedError("")

    def get_lyrics_from_artist(self, artist_name, max_songs=None,
                               year_after=None, year_before=None):
        """TODO

        Parameters
        ----------
        artist_name
        max_songs
        year_after
        year_before

        Returns
        -------

        """
        # TODO: add message
        raise NotImplementedError("")

    def search_song_lyrics(self, song_title, artist_name=None):
        """TODO

        Parameters
        ----------
        song_title
        artist_name

        Returns
        -------

        """
        # TODO: add message
        raise NotImplementedError("")

    def search_album(self, album_title, artist_name=None):
        """TODO

        Parameters
        ----------
        album_title
        artist_name

        Returns
        -------

        """
        # TODO: add message
        raise NotImplementedError("")

    def search_artist(self, artist_name=None):
        """TODO

        Parameters
        ----------
        artist_name

        Returns
        -------

        """
        # TODO: add message
        raise NotImplementedError("")

    def start_scraping(self):
        """Start the web scraping of lyrics websites.

        This method iterates through each lyrics URL from the main config
        file and delegates the important tasks (URL processing and scraping)
        to separate methods (:meth:`_process_url` and
        :meth:`_scrape_webpage`).

        Notes
        -----
        This method catches all exceptions that prevent a given URL from
        being processed further, e.g. the webpage is not found (404 Error) or
        the URL is not from a valid domain.

        Any exception that is not caught here is redirected to the main
        script calling this method. See for example the main script
        :mod:`scripts.scraper`.

        """
        # Process list of URLs to lyrics websites
        for url in self.lyrics_urls:
            skip_url = True
            error = None
            try:
                webpage_filename = self._process_url(url)
                self._scrape_webpage(url, webpage_filename)
            except OSError as e:
                logger.exception(e)
                error = e
            except urllib.error.URLError as e:
                logger.exception(e)
                logger.warning("The URL {} seems to be down!".format(url))
                error = e
            except (FileExistsError,
                    lyrics_scraping.exceptions.CurrentSessionURLError,
                    lyrics_scraping.exceptions.InvalidURLDomainError,
                    lyrics_scraping.exceptions.InvalidURLCategoryError,
                    lyrics_scraping.exceptions.MultipleLyricsURLError,
                    lyrics_scraping.exceptions.OverwriteSongError,
                    pyutils.exceptions.HTTP404Error,
                    pyutils.exceptions.SQLSanityCheckError) as e:
                logger.error(e)
                error = e
            else:
                skip_url = False
            finally:
                # Close db connection
                self.db_conn.close()
                # Add the URL as skipped or good
                if skip_url:
                    self._add_skipped_url(url, get_error_msg(error))
                else:
                    logger.debug("URL successfully processed: "
                                 "{}".format(url))
                    self.good_urls.add(url)
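
    # Sketch of how a main script might drive this method (cf.
    # :mod:`scripts.scraper`); any exception not caught inside
    # start_scraping() propagates to the caller:
    #
    #   scraper = AZLyricsScraper(db_filepath="~/music.sqlite")
    #   try:
    #       scraper.start_scraping()
    #   except Exception as e:
    #       logger.exception(e)
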

    def get_scraped_data(self):
        """Return the scraped data as a dictionary.

        This method returns all the data that was scraped from the lyrics
        webpages. If a database was used, the scraped data is also saved in
        the SQLite database file found at :ref:`db_filepath
        <LyricsScraperParametersLabel>`.

        See :ref:`scraped_data <scraped-data-Label>` for a detailed structure
        of the returned dictionary.

        Returns
        -------
        scraped_data : dict
            The scraped data whose content is described in
            :data:`~scrapers.lyrics_scraper.LyricsScraper.scraped_data`.

        """
        # If a db was used, inform the user that the scraped data is also to
        # be found in the SQLite database that was initially configured.
        if self.db_conn:
            logger.info("The scraped data is also saved in the database "
                        "'{}'".format(self.db_filepath))
        return self.scraped_data
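
    # Example of consuming the returned dictionary (the field order in each
    # tuple follows the 'headers' defined in ``scraped_data``):
    #
    #   data = scraper.get_scraped_data()
    #   for song_title, artist_name, *_ in data['songs']['data']:
    #       print(song_title, '-', artist_name)
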

    def _add_skipped_url(self, url, error):
        """Add a URL as skipped.

        The skipped URL is added to a dictionary along with its error message
        which explains why it was skipped.

        Parameters
        ----------
        url : str
            The skipped URL which will be added to the dictionary along with
            its corresponding error message.
        error : Exception
            The error as an :exc:`Exception`, e.g. :exc:`TypeError`, which
            will be converted to a string and added to the dictionary along
            with its corresponding URL.

        """
        logger.warning("Skipping the URL {}".format(url))
        self.skipped_urls.setdefault(url, [])
        self.skipped_urls[url].append(str(error))

    def _url_already_processed(self, url):
        """Check if a URL was already processed.

        First, the URL is checked if it was already processed during the
        current session. Then, the URL is checked if it is already present in
        the database (if a database is used).

        By doing these checks, we avoid a lot of unnecessary computation,
        such as scraping and saving an already processed webpage.

        Parameters
        ----------
        url : str
            The URL to be checked if it was previously processed.

        Returns
        -------
        TODO

        """
        retcode = 1
        # First, check if the URL was already processed during the current
        # session
        if url in self.checked_urls:
            logger.warning("The URL was already processed during this "
                           "session: {}".format(url))
        elif self.db_filepath and self._url_in_db(url) == 2:
            # The URL was found in the db
            retcode = 2
        else:
            # URL is brand new! Thus, it can be further processed.
            retcode = 0
            logger.debug("The URL was not previously processed: "
                         "{}".format(url))
            self.checked_urls.add(url)
        return retcode

    def _url_in_db(self, url):
        """Check if a URL is already present in the database.

        When processing a given artist or lyrics URL, we check if it is
        already in the db. Hence, we speed up the program execution by not
        processing the same URL again.

        However, if the option `overwrite_db` is set to True, then the URL
        will be processed again.

        Parameters
        ----------
        url : str
            URL to be checked if it is already in the db.

        Returns
        -------
        TODO

        Raises
        ------
        MultipleLyricsURLError
            Raised if a URL was found more than once in the music db.

        """
        retcode = 1
        # Select all songs with the given URL from the music db
        res = self._select_song_from_url(url)
        if len(res) == 1:
            # Only one song found with the given URL
            logger.debug("There is already a song with the same URL: "
                         "{}".format(url))
            # Check if the song found with the given URL can be updated
            # (overwritten) in the table
            if self.overwrite_db:
                # Song can be updated
                retcode = 2
                logger.debug("Since the 'overwrite_db' flag is set to True, "
                             "the URL will be processed and the music db "
                             "will be updated as a consequence")
            else:
                # Song can't be updated
                # TODO: it should be a warning
                logger.debug("Since the 'overwrite_db' flag is set to False, "
                             "the URL will be ignored")
        elif len(res) == 0:
            # No song found with the given URL
            retcode = 0
            logger.debug("The song URL was not found in the music "
                         "db: {}".format(url))
        else:
            # Odd case: more than one song was found with the given URL
            raise lyrics_scraping.exceptions.MultipleLyricsURLError(
                "The song URL was found more than once in the music "
                "db: {}".format(url))
        return retcode

    @staticmethod
    def _count_empty_items(data):
        """Count empty items in a tuple.

        Returns the number of empty items in a list or tuple, where an empty
        item is an empty string or :obj:`None`.

        Parameters
        ----------
        data : list or tuple
            The tuple whose content will be checked for the number of empty
            items.

        Returns
        -------
        count : int
            The number of empty items (empty strings or :obj:`None`) in the
            tuple.

        Notes
        -----
        A warning is logged if empty items are found.

        """
        # Count the number of empty items in the list/tuple
        count = sum([1 for f in data if not f])
        if count:
            # At least one empty item found
            logger.warning("Empty field{}: {}".format(plural(count), data))
        return count

    def _process_url(self, url):
        """Process each URL defined in the YAML config file.

        The URLs can refer to an artist or lyrics webpage. In order to reduce
        the number of HTTP requests to the lyrics website, the URL is first
        checked if it has already been processed.

        Parameters
        ----------
        url : str
            URL to the artist's or lyrics webpage that will be scraped.

        Returns
        -------
        webpage_filepath : str
            File path where the webpage's HTML will be cached.

        Raises
        ------
        InvalidURLDomainError
            Raised if the URL is not from a valid domain. See
            :data:`~LyricsScraper.valid_domains`.

        Notes
        -----
        For a more robust parsing of the top-level domain, `use a specialized
        library`_ (e.g. tldextract). For example, `urlparse` will not be able
        to extract the right domain from a more complex URL such as
        'http://forums.news.cnn.com/'. On the other hand, `tldextract` will
        output 'cnn' which is correct.

        """
        logger.info("Processing the URL {}".format(url))
        # Check first if the URL was already processed, e.g. is found in
        # the db
        self._url_already_processed(url)
        domain = urlparse(url).netloc
        # Get the name of the directory in cache where the webpages are/will
        # be saved
        if self.cache_dirpath:
            # Cache to be used. Webpages will be saved on disk.
            webpages_dirpath = os.path.join(self.cache_dirpath, domain)
            webpage_filepath = os.path.join(webpages_dirpath,
                                            os.path.basename(url))
        else:
            # No cache used. Thus, the webpages will not be saved on disk.
            webpages_dirpath = ""
            webpage_filepath = ""
        # Check if the webpage associated with the URL is already cached
        if os.path.isfile(webpage_filepath):
            # NOTE: if None is given to os.path.isfile(), it complains with
            # this error:
            # TypeError: stat: path should be string, bytes, os.PathLike or
            # integer, not NoneType
            logger.info("The webpage {} was found in cache @ "
                        "'{}'".format(url, webpage_filepath))
        else:
            # The given webpage is brand new!
            logger.info("We will retrieve the webpage {}".format(url))
            # Check if the URL is available
            # NOTE: it can also be done with requests which is not installed
            # by default on Python.
            logger.debug("Checking if the URL {} is available".format(url))
            code = urlopen(url).getcode()
            logger.debug("The URL {} is up. Status code: {}".format(
                url, code))
            logger.debug("Validating the URL's domain")
            # Validate the URL's domain
            if domain in self.valid_domains:
                logger.debug("The domain '{}' is valid".format(domain))
            else:
                raise lyrics_scraping.exceptions.InvalidURLDomainError(
                    "The URL's domain '{}' is invalid. Only URLs from"
                    " {} are accepted.".format(domain, self.valid_domains))
            # Create directory for caching the webpage
            if self.cache_dirpath:
                try:
                    create_dir(webpages_dirpath)
                except FileExistsError as e:
                    logger.warning(e)
        return webpage_filepath

    def _scrape_webpage(self, url, webpage_filepath):
        """Scrape a given webpage and save the scraped data.

        It crawls the webpage and scrapes any useful info to be saved, such
        as the song's title and the lyrics text.

        The scraped data is saved in the :data:`~LyricsScraper.scraped_data`
        dictionary and in a database (if it was initially configured).

        If the cache is used, the webpage HTML is also saved on disk to
        reduce the number of requests to the server.

        Parameters
        ----------
        url : str
            The URL of the webpage to be scraped.
        webpage_filepath : str
            The path of the webpage where its HTML will be saved if the cache
            is used.

        """
        raise NotImplementedError("The _scrape_webpage() method needs to be"
                                  " implemented by the derived classes of"
                                  " LyricsScraper.")

    def _save_album(self, album_title, artist_name, year):
        """Save the scraped data about an album.

        The data will be saved in the :data:`~LyricsScraper.scraped_data`
        dictionary and in a database if it was initially configured.

        Parameters
        ----------
        album_title : str
            The title of the album.
        artist_name : str
            The name of the artist.
        year : str
            The year the album was published.

        Notes
        -----
        If the album title and artist name are missing (i.e. empty strings),
        the album data will not be saved.

        """
        album_tuple = (album_title, artist_name, year,)
        logger.debug("Saving the album {}".format(album_tuple))
        # Save the album only if the album title and artist name are not
        # missing
        if not self._count_empty_items(album_tuple[0:2]):
            if self.db_conn:
                # Save data into db
                self._insert_album(album_tuple)
            # Save data into dict
            self._update_scraped_data(
                album_tuple, self.scraped_data['albums']['data'])
        else:
            logger.warning("Album couldn't be saved!")

    def _save_artist(self, artist_name):
        """Save the scraped data about an artist.

        The data will be saved in the :data:`~LyricsScraper.scraped_data`
        dictionary and in a database if it was initially configured.

        Parameters
        ----------
        artist_name : str
            The name of the artist.

        Notes
        -----
        If the artist name is missing (i.e. empty string), the artist data
        will not be saved.

        """
        artist_tuple = (artist_name,)
        logger.debug("Saving the artist {}".format(artist_tuple))
        # Save the artist only if the artist name is not missing
        if not self._count_empty_items(artist_tuple):
            if self.db_conn:
                # Save data into db
                self._insert_artist(artist_tuple)
            # Save data into dict
            self._update_scraped_data(
                artist_tuple, self.scraped_data['artists']['data'])
        else:
            logger.warning("Artist couldn't be saved!")

    def _save_song(self, song_title, artist_name, album_title, lyrics_url,
                   lyrics, year):
        """Save the scraped data about a song.

        The data will be saved in the :data:`~LyricsScraper.scraped_data`
        dictionary and in a database if it was initially configured.

        Parameters
        ----------
        song_title : str
            The title of the song.
        artist_name : str
            The name of the artist.
        album_title : str
            The title of the album.
        lyrics_url : str
            The URL to the lyrics webpage where the scraped data comes from.
        lyrics : str
            The lyrics text.
        year : str
            The year the song was published.

        Notes
        -----
        If the song title is missing (i.e. empty string), the song data will
        not be saved.

        """
        song_tuple = (song_title, artist_name, album_title, lyrics_url,
                      lyrics, year)
        logger.debug("Saving the song {}".format(song_tuple))
        # Save the song only if the song title is not missing
        if not self._count_empty_items(song_tuple[0:1]):
            if self.db_conn:
                # Save data into db
                self._insert_song(song_tuple)
            # Save data into dict
            self._update_scraped_data(
                song_tuple, self.scraped_data['songs']['data'])
        else:
            logger.warning("Song couldn't be saved!")

    def _update_scraped_data(self, data_tuple, scraped_data):
        """Update the scraped data.

        Update the list of scraped data by adding the tuple of data. The
        tuple of data must be **unique** in order to be added to the list of
        scraped data.

        Parameters
        ----------
        data_tuple : tuple
            The tuple of data to be added to the list of scraped data.
        scraped_data : list
            The list of scraped data where the tuple of data will be added.
""" # Check if tuple of data is unique if data_tuple in scraped_data: # Tuple of data is not unique logger.debug("Scraped data already previously saved: " "{}".format(data_tuple)) else: # Tuple of data is unique. Thus, save it. scraped_data.append(data_tuple) logger.debug("Scraped data successfully saved: " "{}".format(data_tuple)) def _execute_sql(self, sql, values): """Execute an SQL expression. The SQL expression can be a SELECT or an INSERT query. If it is a SELECT query, a list of tuple is returned. If it is an INSERT query, then the id of the last row in the updated table is returned. Parameters ---------- sql : str SQL query to be executed. values : tuple of str The values associated with the query (e.g. the values to be inserted if it is an INSERT query). Returns ------- cur.fetchall() : list of tuple List of tuple from the executed SELECT query, where each tuple represents one row entry. .. important:: This returned value only happens with **SELECT** queries. None Returned if the table couldn't be updated because of an :exc:`sqlite3.IntegrityError` exception. .. important:: This returned value only happens with **INSERT** queries. It is not a fatal exception that should stop the program execution since the exception can occur when the data to be inserted is already in the database which is a common case (e.g. if we add songs from the same artist, the artist name wilL only be added once). If this case happens, we add the rest of the scraped data. lastrowid : int The id of the last row in the updated table, after the insertion was successful. .. important:: This returned value only happens with **INSERT** queries. Raises ------ SQLSanityCheckError Raised if a sanity check on the SQL query failed, e.g. the query's values are not of :obj:`tuple` type or wrong number of values in the SQL query. Notes ----- ``values`` is needed only in the case of an INSERT query since we are inserting data into the db, unlike a SELECT query which only retrieve data from the db. When executing an INSERT query, the returned value (i.e. ``lastrowid``) is not used within the corresponding INSERT method, e.g. :meth:`~LyricsScraper._insert_album`. Check this `guide`_ for more information about SQLite database operations. .. important:: See the structure of the music database as defined in the `music.sql schema`_. """ cur = self.db_conn.cursor() try: sql_sanity_checks(sql, values) cur.execute(sql, values) except sqlite3.IntegrityError as e: # Duplicate data can't be inserted logger.debug(e) return None except pyutils.exceptions.SQLSanityCheckError as e: # One of the SQL sanity checks failed logger.error(e) raise else: # Successful SQL expression execution if sql.lower().startswith("select"): # SELECT query return cur.fetchall() else: # INSERT query if not self.autocommit: # Since autocommit is disabled, we must manually commit # all pending changes to the database self.db_conn.commit() logger.debug("Query execution successful! " "lastrowid={}".format(cur.lastrowid)) return cur.lastrowid def _insert_album(self, album): """Insert data about an album in the database. Data about an album can consist in the album' title and the year the album was published. See the `albums` table as defined in the `music.sql schema`_. Parameters ---------- album : tuple of str The tuple contains the relevant data about an album that will be added to the database, such as the album title, the artist name, and the year the album was published. 
""" sql = "INSERT INTO albums (album_title, artist_name, year)" \ " VALUES (?, ?, ?)" self._execute_sql(sql, album) def _insert_artist(self, artist_name): """Insert an artist's name in the database. An artist's name can refer to a group or an individual (solo). See the `artists` table as defined in the `music.sql schema`_. Parameters ---------- artist_name : tuple of str The tuple contains the name of the artist that will be added to the database. """ sql = "INSERT INTO artists (artist_name) VALUES (?)" self._execute_sql(sql, artist_name) def _insert_song(self, song): """Insert data about a song in the database. The data about a song that will be added to the database can consist to the song title, artist name, and album title. See the `songs` table as defined in the `music.sql schema`_. Parameters ---------- song : tuple of str The tuple contains the relevant data about a song that will be added to the database, such as the song title, the artist name, and the lyrics text. """ sql = "INSERT INTO songs (song_title, artist_name, album_title," \ " lyrics_url, lyrics, year) VALUES (?, ?, ?, ?, ?, ?)" self._execute_sql(sql, song) def _select_song_from_url(self, lyrics_url): """Select a song from the database based on a song URL. The song URL is used as the WHERE condition to be used for retrieving the associated row from the database. See the `songs_urls` table as defined in the `music.sql schema`_. Parameters ---------- lyrics_url : str Lyrics URL to be used in the WHERE condition of the SELECT query. Returns ------- cur.fetchall() : list of tuple List of tuple from the executed SELECT query, where each tuple represents one row entry. """ logger.debug("Selecting the song where " "lyrics_url={}".format(lyrics_url)) sql = "SELECT * FROM songs_urls WHERE lyrics_url=?" return self._execute_sql(sql, (lyrics_url,)) def __enter__(self): return self def __exit__(self, type, value, traceback): # print("Exception has been handled") self.compute_cache.db_conn.close() return True


class Lyrics:
    """TODO: remove, to be replaced by Song

    """

    def __init__(self, song_title, artist_name, album_title, lyrics_url,
                 lyrics_text, year):
        self.song_title = song_title
        self.artist_name = artist_name
        self.album_title = album_title
        self.lyrics_url = lyrics_url
        self.lyrics_text = lyrics_text
        self.year = year


class Song:
    """TODO

    """

    def __init__(self, song_title, artist_name, album_title, lyrics_url,
                 lyrics_text, year):
        self.song_title = song_title
        self.artist_name = artist_name
        self.album_title = album_title
        self.lyrics_url = lyrics_url
        self.lyrics_text = lyrics_text
        self.year = year


class Album:
    """TODO

    """

    def __init__(self, album_title, artist_name, album_url, year):
        self.artist_name = artist_name
        self.album_title = album_title
        self.album_url = album_url
        self.year = year

    @staticmethod
    def check_album_year(year_result):
        """TODO

        Parameters
        ----------
        year_result : list
            TODO

        """
        # TODO: explain
        if len(year_result) != 1:
            raise lyrics_scraping.exceptions.NonUniqueAlbumYearError(
                "The album year extraction doesn't result in a UNIQUE number")
        elif not (len(year_result[0]) == 4 and year_result[0].isdecimal()):
            raise lyrics_scraping.exceptions.WrongAlbumYearError(
                "The album year extraction scheme broke: the year '{}' is "
                "not a number with four digits".format(year_result[0]))
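
    # For example, a ``year_result`` of ['1990'] passes the check, while
    # ['199'] raises WrongAlbumYearError and ['1990', '1991'] raises
    # NonUniqueAlbumYearError (the extraction is expected to yield exactly
    # one four-digit year).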


class Artist:
    """TODO

    """

    def __init__(self, song_title, artist_name, artist_url):
        self.song_title = song_title
        self.artist_name = artist_name
        self.artist_url = artist_url


class ComputeCache:
    """TODO

    """

    def __init__(self, schema_filepath, ram_size):
        self.schema_filepath = schema_filepath
        self.db_conn = self._setup_db()
        self.ram_size = ram_size

    def _setup_db(self):
        """TODO

        Returns
        -------
        db_conn
            TODO

        """
        db_conn = connect_db(':memory:')
        # db_conn = sqlite3.connect(':memory:')
        logger.debug("<color>Executing schema for db ':memory:' ...</color>")
        with open(self.schema_filepath, 'rt') as f:
            schema = f.read()
            db_conn.executescript(schema)
            # cur = db_conn.cursor()
            # cur.executescript(schema)
        return db_conn