"""Module that defines the base class for scraping lyrics websites and saving
the scraped content.
More specifically, the derived classes (e.g.
:class:`~scrapers.azlyrics_scraper.AZLyricsScraper`) are the ones that do the
actual scraping of the lyrics webpages.
By default, the scraped data is saved in a dictionary (see the variable
:data:`~LyricsScraper.scraped_data`).
The scraped data can also be saved in a database if a path to the SQLite
database is given via the argument :ref:`db_filepath
<LyricsScraperParametersLabel>`.
See the structure of the music database as defined in the `music.sql schema`_.
.. _guide: https://bit.ly/2xYreie
.. _HTTP GET request: https://www.webopedia.com/TERM/H/HTTP_request_header.html
.. _music.sql schema: https://bit.ly/2kIMYvn
.. _saveutils.py: https://bit.ly/2m5z46A
.. _saveutils.SaveWebpages: https://bit.ly/2oaz7Px
.. _scraper.py: https://bit.ly/2msZDTC
.. _use a specialized library: https://stackoverflow.com/a/56476496
.. _YAML logging file: https://bit.ly/2m5wjSM
"""
import logging
import os
import random
import sqlite3
# NOTE:
# For urllib with Python 2, it is
# from six.moves.urllib.parse import urlparse
import urllib
from logging import NullHandler
from urllib.request import urlopen
from urllib.parse import urlparse
import lyrics_scraping.exceptions
import pyutils.exceptions
from lyrics_scraping.utils import plural, get_data_filepath
from pyutils.dbutils import connect_db, create_db, sql_sanity_checks
from pyutils.genutils import create_dir
from pyutils.logutils import get_error_msg, setup_logging_from_cfg
from pyutils.webcache import WebCache
logger = logging.getLogger(__name__)
logger.addHandler(NullHandler())
_SETUP_LOGGING = True
class LyricsScraper:
    """Base class for scraping and saving webpages locally.

    This class is responsible for doing lots of configuration before the web
    scraping starts, such as setting up logging and the database.

    The actual scraping of the lyrics websites is done by the derived classes
    (e.g. :class:`~scrapers.azlyrics_scraper.AZLyricsScraper`) since each
    lyrics website has its own way of being crawled (they are all designed
    differently). However, the base class is responsible for saving the
    scraped data in a dictionary (:data:`~LyricsScraper.scraped_data`) and in
    a database (if one was initially configured).

    .. _LyricsScraperParametersLabel:

    Parameters
    ----------
    lyrics_urls : list [str]
        List of URLs to lyrics webpages which will be scraped.
    db_filepath : str, optional
        File path to the SQLite music database (the default value is
        :obj:`None` which implies that no database will be used. The scraped
        data will be saved only in the :data:`~LyricsScraper.scraped_data`
        dictionary).
    autocommit : bool, optional
        Whether the changes to the database are committed right away (the
        default is False which implies that the changes won't take effect
        immediately).
    overwrite_db : bool, optional
        Whether the database will be overwritten. The user is given some time
        to stop the script before the database is overwritten (the default
        value is False).
    update_tables : bool, optional
        Whether the tables in the database can be updated (the default value
        is False).
    cache_dirpath : str, optional
        Path to the cache directory where webpages are saved (the default
        value is :obj:`None` which implies that the cache will not be used).
    overwrite_webpages : bool, optional
        Whether the webpages saved in cache can be overwritten (the default
        value is False).
    http_get_timeout : int, optional
        Timeout when a GET request doesn't receive any response from the
        server. After the timeout expires, the GET request is dropped (the
        default value is 5 seconds).
    delay_between_requests : int, optional
        A delay will be added between HTTP requests in order to reduce the
        workload on the server (the default value is 8 seconds which implies
        that there will be a delay of 8 seconds between successive HTTP
        requests).
    headers : dict, optional
        The information added to the `HTTP GET request`_ that a user's
        browser sends to a Web server containing the details of what the
        browser wants and will accept back from the server (the default value
        is defined in :obj:`saveutils.SaveWebpages.headers`).
    use_logging : bool, optional
        Whether to log messages on console and file. The logging is setup
        according to the `YAML logging file`_ (the default value is False
        which implies that no logging will be used and thus no messages will
        be printed on the console).
    **kwargs : dict
        TODO

    Attributes
    ----------
    skipped_urls : dict [str, str]
        Stores the URLs that were skipped because of an error such as
        :exc:`OSError` or :exc:`~exceptions.connection.HTTP404Error`,
        along with the error message. The keys are the URLs and the values
        are the associated error messages.
    good_urls : set
        Stores the unique URLs that were successfully processed and saved.
    checked_urls : set
        Stores the unique URLs that were processed (whether successfully or
        unsuccessfully) during the current session. Thus, `checked_urls`
        should equal to `skipped_urls` + `good_urls`.
    db_conn : sqlite3.Connection
        SQLite database connection.
    saver : :class:`saveutils.SaveWebpages`
        For retrieving webpages and saving them in cache. See
        :mod:`saveutils`.
    valid_domains : list
        Only URLs from these domains will be processed.
    logging_filepath : str
        Path to the `YAML logging file`_ which is used to setup logging for
        all custom modules.
    schema_filepath : str
        Path to `music.sql schema`_ for building the music database which
        will store the scraped data.
    scraped_data : dict
        The scraped data is saved as a dictionary. Its structure is based on
        the database's `music.sql schema`_.

    Notes
    -----
    If the corresponding flags are activated, logging and database are setup
    in :meth:`__init__`.

    By default, the scraped data is saved in a dictionary whose structure is
    described below (see :data:`~LyricsScraper.scraped_data`). The scraped
    data will also be saved if a database is given via :ref:`db_filepath
    <LyricsScraperParametersLabel>`.

    See the structure of the music database as defined in the `music.sql
    schema`_.

    The scraped webpages can also be cached in order to reduce the number of
    HTTP requests to the server (See :ref:`db_filepath
    <LyricsScraperParametersLabel>`).
    """

    # Whitelist of domains whose URLs may be processed (see _process_url()).
    valid_domains = ["www.azlyrics.com"]

    # NOTE(review): this is a *class-level* mutable attribute, so the 'data'
    # lists are shared by every instance of LyricsScraper (and its
    # subclasses) — confirm this sharing is intended.
    scraped_data = {
        'albums': {
            'headers': ('album_title', 'artist_name', 'year',),
            'data': []
        },
        'artists': {
            'headers': ('artist_name',),
            'data': []
        },
        'songs': {
            'headers': ('song_title', 'artist_name', 'album_title',
                        'lyrics_url', 'lyrics', 'year',),
            'data': []
        }
    }
    """The scraped data is saved as a dictionary.

    .. _scraped-data-Label:

    Its keys and values are defined as follow:

    .. code:: python

        scraped_data = {
            'albums': {
                'headers': ('album_title', 'artist_name', 'year',),
                'data': []
            },
            'artists': {
                'headers': ('artist_name',),
                'data': []
            },
            'songs': {
                'headers': ('song_title', 'artist_name', 'album_title',
                            'lyrics_url', 'lyrics', 'year',),
                'data': []
            }
        }

    .. note:: The 'data' key points to a list of tuple that eventually will
        store the scraped data from different URLs, i.e. each scraped data
        from a given URL is added as a tuple to the list.
    """
    # TODO: add example of data.
def __init__(self, db_filepath="", overwrite_db=False,
use_webcache=True, webcache_dirpath="~/.cache/lyric_scraping/",
expire_after=25920000, use_compute_cache=True, ram_size=100,
http_get_timeout=5, delay_between_requests=8,
headers=WebCache.HEADERS, seed=123456, interactive=False,
delay_interactive=30, best_match=False, simulate=False,
ignore_errors=False):
self.skipped_urls = {}
self.good_urls = set()
self.checked_urls = set()
# TODO: AssertionError are raised in both lines
self.logging_cfg_filepath = get_data_filepath(file_type='log')
self.schema_filepath = get_data_filepath(file_type='schema')
# ==============
# Logging config
# ==============
if _SETUP_LOGGING:
# Setup logging for all custom modules based on the default logging
# config file
logger.debug("<color>Setting up logging ...</color>")
setup_logging_from_cfg(self.logging_cfg_filepath)
logger.info("<color>Logging is setup</color>")
else:
logger.warning("<color>Logging was already setup</color>")
# ===============
# Database config
# ===============
self.overwrite_db = overwrite_db
self.db_filepath = os.path.expanduser(db_filepath)
# TODO: remove db_conn from everywhere
self.db_conn = None
if self.db_filepath:
logger.debug("<color>Setting up the music database ...</color>")
# Create music db if necessary
# TODO: IOError and sqlite3.OperationalError are raised
create_db(self.db_filepath,
self.schema_filepath,
self.overwrite_db)
logger.info("<color>Music database is setup</color>")
else:
# No database to fbe used
logger.debug("<color>No music database used</color>")
# ================
# Web cache config
# ================
self.webcache_dirpath = os.path.expanduser(webcache_dirpath)
self.cache_name = os.path.join(self.webcache_dirpath, "cache")
self.use_webcache = use_webcache
self.expire_after = expire_after
self.http_get_timeout = http_get_timeout
self.delay_between_requests = delay_between_requests
self.headers = headers
if self.use_webcache:
logger.debug("<color>Setting up web-cache ...</color>")
logger.debug("<color>Creating the web-cache directory: "
"{}</color>".format(self.webcache_dirpath))
try:
# TODO: FileExistsError and PermissionError are raised
create_dir(self.webcache_dirpath, overwrite=False)
except FileExistsError as e:
logger.debug("<color>{}</color>".format(e))
logger.debug("<color>The webcache directory already exists: "
"{}</color>".format(self.webcache_dirpath))
self.webcache = WebCache(
cache_name=self.cache_name,
expire_after=self.expire_after,
http_get_timeout=self.http_get_timeout,
delay_between_requests=self.delay_between_requests,
headers=self.headers)
logger.info("<color>web-cache is setup</color>")
else:
self.webcache = None
logger.debug("<color>No web-cache used</color>")
# ====================
# Compute cache config
# ====================
self.use_compute_cache = use_compute_cache
self.ram_size = ram_size
if self.use_compute_cache:
logger.debug("<color>Setting up compute-cache ...</color>")
self.compute_cache = ComputeCache(self.schema_filepath,
self.ram_size)
logger.info("<color>compute-cache is setup</color>")
else:
self.compute_cache = None
logger.debug("<color>No compute-cache used</color>")
# ==============
# Scraper config
# ==============
self.seed = seed
random.seed(self.seed)
logger.info("<color>Random number generator initialized with seed={}"
"</color>".format(self.seed))
self.interactive = interactive
self.delay_interactive = delay_interactive
self.best_match = best_match
self.simulate = simulate
self.ignore_errors = ignore_errors
self.min_year = 1000
[docs] def get_song_lyrics(self, song_title, artist_name=None):
"""TODO
Parameters
----------
song_title
artist_name
Returns
-------
"""
# TODO: add message
raise NotImplementedError("")
[docs] def get_lyrics_from_album(self, album_title, artist_name=None,
max_songs=None):
"""TODO
Parameters
----------
album_title
artist_name
max_songs
Returns
-------
"""
# TODO: add message
raise NotImplementedError("")
[docs] def get_lyrics_from_artist(self, artist_name, max_songs=None,
year_after=None, year_before=None):
"""TODO
Parameters
----------
artist_name
max_songs
year_after
year_before
Returns
-------
"""
# TODO: add message
raise NotImplementedError("")
[docs] def search_song_lyrics(self, song_title, artist_name=None):
"""TODO
Parameters
----------
song_title
artist_name
Returns
-------
"""
# TODO: add message
raise NotImplementedError("")
[docs] def search_album(self, album_title, artist_name=None):
"""TODO
Parameters
----------
album_title
artist_name
Returns
-------
"""
# TODO: add message
raise NotImplementedError("")
[docs] def search_artist(self, artist_name=None):
"""TODO
Parameters
----------
artist_name
Returns
-------
"""
# TODO: add message
raise NotImplementedError("")
[docs] def start_scraping(self):
"""Start the web scraping of lyrics websites.
This method iterates through each lyrics URL from the main config file
and delegates the important tasks (URL processing and scraping) to
separate methods (:meth:`_process_url` and :meth:`_scrape_webpage`).
Notes
-----
This method catches all exceptions that prevent a given URL of being
processed further, e.g. the webpage is not found (404 Error) or the URL
is not from a valid domain.
Any exception that is not caught here is redirected to the main script
calling this method. See for example the main script
:mod:`scripts.scraper`.
"""
# Process list of URLs to lyrics websites
for url in self.lyrics_urls:
skip_url = True
error = None
try:
webpage_filename = self._process_url(url)
self._scrape_webpage(url, webpage_filename)
except OSError as e:
logger.exception(e)
error = e
except urllib.error.URLError as e:
logger.exception(e)
logger.warning("The URL {} seems to be down!".format(url))
error = e
except (FileExistsError,
lyrics_scraping.exceptions.CurrentSessionURLError,
lyrics_scraping.exceptions.InvalidURLDomainError,
lyrics_scraping.exceptions.InvalidURLCategoryError,
lyrics_scraping.exceptions.MultipleLyricsURLError,
lyrics_scraping.exceptions.OverwriteSongError,
pyutils.exceptions.HTTP404Error,
pyutils.exceptions.SQLSanityCheckError) as e:
logger.error(e)
error = e
else:
skip_url = False
finally:
# Close db connection
self.db_conn.close()
# Add the URL as skipped or good
if skip_url:
self._add_skipped_url(url, get_error_msg(error))
else:
logger.debug("URL successfully processed: "
"{}".format(url))
self.good_urls.add(url)
[docs] def get_scraped_data(self):
"""Return the scraped data as a dictionary.
This method returns all the data that was scraped from the lyrics
webpages. If a database was used, the scraped data is also saved in the
SQLite database file found at :ref:`db_filepath
<LyricsScraperParametersLabel>`
See :ref:`scraped_data <scraped-data-Label>` for a detailed structure of
the returned dictionary.
Returns
-------
scraped_data : dict
The scraped data whose content is described in
:data:`~scrapers.lyrics_scraper.LyricsScraper.scraped_data`.
"""
# If a db was used, inform the user that the scraped data is also to be
# found in the SQLite database that was initially configured.
if self.db_conn:
logger.info("The scraped data is also saved in the database "
"'{}'".format(self.db_filepath))
return self.scraped_data
def _add_skipped_url(self, url, error):
"""Add an URL as skipped.
The skipped URL is added to a dictionary along with its error message
which explains why it was skipped.
Parameters
----------
url : str
The skipped URL which will be added to the dictionary along with its
corresponding error message.
exc : Exception
The error message as an :exc:`Exception`, e.g. :exc:`TypeError`, which
will be converted to a string and added to the dictionary along with
its corresponding URL.
"""
logger.warning("Skipping the URL {}".format(url))
self.skipped_urls.setdefault(url, [])
self.skipped_urls[url].append(str(error))
    def _url_already_processed(self, url):
        """Check if an URL was already processed.

        First, the URL is checked if it was already processed during the
        current session. Then, the URL is checked if it is already present in
        the database (if a database is used).

        By doing these checks, we reduce a lot of computations that would
        have been unnecessary, like scraping and saving an already processed
        webpage.

        Parameters
        ----------
        url : str
            The URL to be checked if it was previously processed.

        Returns
        -------
        retcode : int
            1 if the URL was already processed during this session,
            2 if the URL was found in the database (and is overwritable),
            0 if the URL is new and can be processed further.
        """
        retcode = 1
        # First, check if the URL was already processed during current session
        if url in self.checked_urls:
            logger.warning("The URL was already processed during this "
                           "session: {}".format(url))
        # NOTE(review): _url_in_db() returns 1 when the URL is in the db but
        # overwrite_db is False; that case fails this '== 2' test and falls
        # through to the 'brand new' branch (retcode 0) — confirm intended.
        elif self.db_filepath and self._url_in_db(url) == 2:
            # The URL was found in the db
            retcode = 2
        else:
            # URL is brand new! Thus, it can be further processed.
            retcode = 0
            logger.debug("The URL was not previously processed: {}".format(url))
            # Remember it so the same URL is not processed twice this session.
            self.checked_urls.add(url)
        return retcode
    def _url_in_db(self, url):
        """Check if an URL is already present in the database.

        When processing a given artist or lyrics URL, we check if it is
        already in the db. Hence, we speed up the program execution by not
        processing the same URL again.

        However, if the option `overwrite_db` is set to True, then the URL
        will be processed again.

        Parameters
        ----------
        url : str
            URL to be checked if it is already in the db.

        Returns
        -------
        retcode : int
            0 if the URL was not found in the db,
            1 if the URL was found and `overwrite_db` is False (ignore it),
            2 if the URL was found and `overwrite_db` is True (process it
            again and update the db).

        Raises
        ------
        MultipleLyricsURLError
            Raised if an URL was found more than once in the music db.
        """
        retcode = 1
        # Select all songs with the given URL from the music db
        res = self._select_song_from_url(url)
        if len(res) == 1:
            # Only one song found with the given URL
            logger.debug("There is already a song with the same URL: "
                         "{}".format(url))
            # Check if the song found with the given URL can be updated
            # (overwritten) in the table
            if self.overwrite_db:
                retcode = 2
                # Song can be updated
                logger.debug("Since the 'overwrite_db' flag is set to True, "
                             "the URL will be processed and the music db will "
                             "be updated as a consequence")
            else:
                # Song can't be updated
                # TODO: it should be a warning
                logger.debug("Since the 'overwrite_db' flag is set to False, "
                             "the URL will be ignored")
        elif len(res) == 0:
            # No song found with the given URL
            retcode = 0
            logger.debug("The song URL was not found in the music "
                         "db: {}".format(url))
        else:
            # Odd case: more than one song was found with the given URL
            raise lyrics_scraping.exceptions.MultipleLyricsURLError(
                "The song URL was found more than once in the music "
                "db: {}".format(url))
        return retcode
@staticmethod
def _count_empty_items(data):
"""Count empty items in a tuple.
Returns the number of empty items in a list or tuple which can be empty
strings or :obj:`None`.
Parameters
----------
data : list or tuple
The tuple whose content will be checked for number of empty items.
Returns
-------
count : int
The number of empty items (empty strings or :obj:`None`) in the tuple.
Notes
-----
A warning that empty items are found is logged.
"""
# Count number of empty items in the list/tuple
count = sum([1 for f in data if not f])
if count:
# At least one empty item found
logger.warning("Empty field{}: {}".format(plural(count), data))
return count
def _process_url(self, url):
"""Process each URL defined in the YAML config file.
The URLs can refer to an artist or lyrics webpage. In order to reduce
the number of HTTP requests to the lyrics website, the URL is first
checked if it has already been processed.
Parameters
----------
url : str
URL to the artist's or lyrics webpage that will be scraped.
Returns
-------
webpage_filepath : str
File path where the webpage's HTML will be cached.
Raises
------
InvalidURLDomainError
Raised if the URL is not from a valid domain. See
:data:`~LyricsScraper.valid_domains`.
Notes
-----
There is a more robust parsing of the top-level domain: `use a
specialized library`_ (e.g. tldextract). For example, `urlparse`
will not be able to extract the right domain from a more complex URL
such as 'http://forums.news.cnn.com/'. On the other hand, `tldextract`
will output 'cnn' which is correct.
"""
logger.info("Processing the URL {}".format(url))
# Check first if the URL was already processed, e.g. is found in the db
self._check_url_if_processed(url)
domain = urlparse(url).netloc
# Get the name of the directory in cache where the webpages are/will be
# saved
if self.cache_dirpath:
# Cache to be used. Webpages will be saved on disk.
webpages_dirpath = os.path.join(self.cache_dirpath, domain)
webpage_filepath = os.path.join(webpages_dirpath,
os.path.basename(url))
else:
# No cache used. Thus, the webpages will not be saved on disk.
webpages_dirpath = ""
webpage_filepath = ""
# Check if the webpage associated with the URL is already cached
if os.path.isfile(webpage_filepath):
# NOTE: if None is given to os.path.isfile(), it complains with this
# error:
# TypeError: stat: path should be string, bytes, os.PathLike or integer,
# not NoneType
logger.info("The webpage {} was found in cache @ "
"'{}'".format(url, webpage_filepath))
else:
# The given webpage is brand new!
logger.info("We will retrieve the webpage {}".format(url))
# Check if the URL is available
# NOTE: it can also be done with requests which is not
# installed by default on Python.
logger.debug("Checking if the URL {} is available".format(url))
code = urlopen(url).getcode()
logger.debug("The URL {} is up. Status code: {}".format(url, code))
self.logger.debug("Validating the URL's domain")
# Validate URL's domain
if domain in self.valid_domains:
logger.debug("The domain '{}' is valid".format(domain))
else:
raise lyrics_scraping.exceptions.InvalidURLDomainError(
"The URL's domain '{}' is invalid. Only URLs from"
" {} are accepted.".format(domain, self.valid_domains))
# Create directory for caching the webpage
if self.cache_dirpath:
try:
create_dir(webpages_dirpath)
except FileExistsError as e:
logger.warning(e)
return webpage_filepath
def _scrape_webpage(self, url, webpage_filepath):
"""Scrape a given webpage and save the scraped data.
It crawls the webpage and scrapes any useful info to be saved, such as
the song's title and the lyrics text.
The scraped data is saved in the :data:`~LyricsScraper.scraped_data`
dictionary and in a database (if it was initially configured).
If the cache is used, the webpage HTML is also save on disk to reduce
the number of requests to the server.
Parameters
----------
url: str
The URL of the webpage to be scraped.
webpage_filepath : str
The path of the webpage where its HTML will be saved if the cache
is used.
"""
raise NotImplementedError("The _scrape_webpage() method needs to be"
" implemented by the derived classes of"
" LyricsScraper.")
def _save_album(self, album_title, artist_name, year):
"""Save the scraped data about an album.
The data will be saved in the `scraped_data` dictionary and a database
if it was initially configured.
Parameters
----------
album_title : str
The title of the album.
artist_name : str
The name of the artist.
year : str
The year the album was published.
Notes
-----
If the album title and artist name are missing (i.e. empty strings), the
album data will not be saved.
"""
album_tuple = (album_title, artist_name, year,)
logger.debug("Saving the album {}".format(album_tuple))
# Save album only if album and artist name are not missing
if not self._count_empty_items(album_tuple[0:2]):
if self.db_conn:
# Save data into db
self._insert_album(album_tuple)
# Save data into dict
self._update_scraped_data(
album_tuple, self.scraped_data['albums']['data'])
else:
logger.warning("Album couldn't be saved!")
def _save_artist(self, artist_name):
"""Save the scraped data about an artist.
The data will be saved in the :data:`~LyricsScraper.scraped_data`
dictionary and a database if it was initially configured.
Parameters
----------
artist_name : str
The name of the artist.
Notes
-----
If the artist name is missing (i.e. empty string), the artist data will
not be saved.
"""
artist_tuple = (artist_name,)
logger.debug("Saving the artist {}".format(artist_tuple))
# Save album only if artist name is not missing
if not self._count_empty_items(artist_tuple):
if self.db_conn:
# Save data into db
self._insert_artist(artist_tuple)
# Save data into dict
self._update_scraped_data(
artist_tuple, self.scraped_data['artists']['data'])
else:
logger.warning("Artist couldn't be saved!")
def _save_song(self, song_title, artist_name, album_title, lyrics_url,
lyrics, year):
"""Save the scraped data about a song.
The data will be saved in the :data:`~LyricsScraper.scraped_data`
dictionary and a database if it was initially configured.
Parameters
----------
song_title : str
The title of the song
artist_name : str
The name of the artist.
album_title : str
The title of the album.
lyrics_url : str
The URL to the lyrics webpage where the scraped data comes from.
lyrics : str
The text lyrics.
year : str
The year the song was published.
Notes
-----
If the song title is missing (i.e. empty string), the song data will
not be saved.
"""
song_tuple = (song_title, artist_name, album_title, lyrics_url, lyrics,
year)
logger.debug("Saving the song {}".format(song_tuple))
# Save album only if the song title is not missing
if not self._count_empty_items(song_tuple[0:1]):
if self.db_conn:
# Save data into db
self._insert_song(song_tuple)
# Save data into dict
self._update_scraped_data(
song_tuple, self.scraped_data['songs']['data'])
else:
logger.warning("Song couldn't be saved!")
def _update_scraped_data(self, data_tuple, scraped_data):
"""Update scraped data.
Update the list of scraped by adding the tuple of data.
The tuple of data must be **unique** in order to be added to the list
of scraped data.
Parameters
----------
data_tuple : tuple
The tuple of data to be added to the list of scraped data.
scraped_data : list
The list of scraped data where the tuple of data will be added.
"""
# Check if tuple of data is unique
if data_tuple in scraped_data:
# Tuple of data is not unique
logger.debug("Scraped data already previously saved: "
"{}".format(data_tuple))
else:
# Tuple of data is unique. Thus, save it.
scraped_data.append(data_tuple)
logger.debug("Scraped data successfully saved: "
"{}".format(data_tuple))
    def _execute_sql(self, sql, values):
        """Execute an SQL expression.

        The SQL expression can be a SELECT or an INSERT query.

        If it is a SELECT query, a list of tuple is returned. If it is an
        INSERT query, then the id of the last row in the updated table is
        returned.

        Parameters
        ----------
        sql : str
            SQL query to be executed.
        values : tuple of str
            The values associated with the query (e.g. the values to be
            inserted if it is an INSERT query).

        Returns
        -------
        cur.fetchall() : list of tuple
            List of tuple from the executed SELECT query, where each tuple
            represents one row entry.

            .. important::
                This returned value only happens with **SELECT** queries.
        None
            Returned if the table couldn't be updated because of an
            :exc:`sqlite3.IntegrityError` exception.

            .. important::
                This returned value only happens with **INSERT** queries.
                It is not a fatal exception that should stop the program
                execution since the exception can occur when the data to be
                inserted is already in the database which is a common case
                (e.g. if we add songs from the same artist, the artist name
                will only be added once). If this case happens, we add the
                rest of the scraped data.
        lastrowid : int
            The id of the last row in the updated table, after the insertion
            was successful.

            .. important::
                This returned value only happens with **INSERT** queries.

        Raises
        ------
        SQLSanityCheckError
            Raised if a sanity check on the SQL query failed, e.g. the
            query's values are not of :obj:`tuple` type or wrong number of
            values in the SQL query.

        Notes
        -----
        ``values`` is needed only in the case of an INSERT query since we
        are inserting data into the db, unlike a SELECT query which only
        retrieve data from the db.

        When executing an INSERT query, the returned value (i.e.
        ``lastrowid``) is not used within the corresponding INSERT method,
        e.g. :meth:`~LyricsScraper._insert_album`.

        Check this `guide`_ for more information about SQLite database
        operations.

        .. important::
            See the structure of the music database as defined in the
            `music.sql schema`_.
        """
        cur = self.db_conn.cursor()
        try:
            # Validate the query/values pair before touching the db.
            sql_sanity_checks(sql, values)
            cur.execute(sql, values)
        except sqlite3.IntegrityError as e:
            # Duplicate data can't be inserted (UNIQUE/PK violation); this is
            # expected for repeated artists/albums, so just log and move on.
            logger.debug(e)
            return None
        except pyutils.exceptions.SQLSanityCheckError as e:
            # One of the SQL sanity checks failed: fatal, re-raise to caller.
            logger.error(e)
            raise
        else:
            # Successful SQL expression execution
            if sql.lower().startswith("select"):
                # SELECT query
                return cur.fetchall()
            else:
                # INSERT query
                # NOTE(review): assumes self.autocommit has been set on the
                # instance — confirm it is initialized before this is called.
                if not self.autocommit:
                    # Since autocommit is disabled, we must manually commit
                    # all pending changes to the database
                    self.db_conn.commit()
                logger.debug("Query execution successful! "
                             "lastrowid={}".format(cur.lastrowid))
                return cur.lastrowid
def _insert_album(self, album):
"""Insert data about an album in the database.
Data about an album can consist in the album' title and the year the
album was published.
See the `albums` table as defined in the `music.sql schema`_.
Parameters
----------
album : tuple of str
The tuple contains the relevant data about an album that will be
added to the database, such as the album title, the artist name, and
the year the album was published.
"""
sql = "INSERT INTO albums (album_title, artist_name, year)" \
" VALUES (?, ?, ?)"
self._execute_sql(sql, album)
def _insert_artist(self, artist_name):
"""Insert an artist's name in the database.
An artist's name can refer to a group or an individual (solo).
See the `artists` table as defined in the `music.sql schema`_.
Parameters
----------
artist_name : tuple of str
The tuple contains the name of the artist that will be added to the
database.
"""
sql = "INSERT INTO artists (artist_name) VALUES (?)"
self._execute_sql(sql, artist_name)
def _insert_song(self, song):
"""Insert data about a song in the database.
The data about a song that will be added to the database can consist to
the song title, artist name, and album title.
See the `songs` table as defined in the `music.sql schema`_.
Parameters
----------
song : tuple of str
The tuple contains the relevant data about a song that will be
added to the database, such as the song title, the artist name, and
the lyrics text.
"""
sql = "INSERT INTO songs (song_title, artist_name, album_title," \
" lyrics_url, lyrics, year) VALUES (?, ?, ?, ?, ?, ?)"
self._execute_sql(sql, song)
def _select_song_from_url(self, lyrics_url):
"""Select a song from the database based on a song URL.
The song URL is used as the WHERE condition to be used for retrieving
the associated row from the database.
See the `songs_urls` table as defined in the `music.sql schema`_.
Parameters
----------
lyrics_url : str
Lyrics URL to be used in the WHERE condition of the SELECT query.
Returns
-------
cur.fetchall() : list of tuple
List of tuple from the executed SELECT query, where each tuple
represents one row entry.
"""
logger.debug("Selecting the song where "
"lyrics_url={}".format(lyrics_url))
sql = "SELECT * FROM songs_urls WHERE lyrics_url=?"
return self._execute_sql(sql, (lyrics_url,))
    def __enter__(self):
        """Support the context-manager protocol; return the scraper itself."""
        return self
def __exit__(self, type, value, traceback):
# print("Exception has been handled")
self.compute_cache.db_conn.close()
return True
class Lyrics:
    """Plain data holder for the scraped lyrics of one song.

    TODO: remove, to be replaced by Song
    """

    def __init__(self, song_title, artist_name, album_title, lyrics_url,
                 lyrics_text, year):
        # Store every scraped field verbatim on the instance.
        self.year = year
        self.lyrics_text = lyrics_text
        self.lyrics_url = lyrics_url
        self.album_title = album_title
        self.artist_name = artist_name
        self.song_title = song_title
class Song:
    """Plain data holder for one scraped song (title, artist, album,
    lyrics URL, lyrics text and year).
    """

    def __init__(self, song_title, artist_name, album_title, lyrics_url,
                 lyrics_text, year):
        # Store every scraped field verbatim on the instance.
        self.year = year
        self.lyrics_text = lyrics_text
        self.lyrics_url = lyrics_url
        self.album_title = album_title
        self.artist_name = artist_name
        self.song_title = song_title
class Album:
    """Plain data holder for one scraped album, plus a helper that validates
    an extracted album year.
    """

    def __init__(self, album_title, artist_name, album_url, year):
        # Store every scraped field verbatim on the instance.
        self.artist_name = artist_name
        self.album_title = album_title
        self.album_url = album_url
        self.year = year

    @staticmethod
    def check_album_year(year_result):
        """Validate the result of the album-year extraction.

        Parameters
        ----------
        year_result : list
            Result of the year extraction; expected to hold exactly one
            string of four decimal digits.

        Raises
        ------
        NonUniqueAlbumYearError
            If the extraction did not produce exactly one candidate year.
        WrongAlbumYearError
            If the candidate year is not a four-digit decimal number.
        """
        if len(year_result) != 1:
            raise lyrics_scraping.exceptions.NonUniqueAlbumYearError(
                "The album year extraction doesn't result in a UNIQUE number")
        year = year_result[0]
        if not (year.isdecimal() and len(year) == 4):
            raise lyrics_scraping.exceptions.WrongAlbumYearError(
                "The Album year extraction scheme broke: the year '{}' is not a "
                "number with four digits".format(year))
class Artist:
    """Plain data holder for one scraped artist.

    NOTE(review): it also stores a ``song_title`` — confirm this field really
    belongs on Artist.
    """

    def __init__(self, song_title, artist_name, artist_url):
        # Store every scraped field verbatim on the instance.
        self.artist_url = artist_url
        self.artist_name = artist_name
        self.song_title = song_title
class ComputeCache:
    """In-memory SQLite database used as a compute cache for scraped data.

    Parameters
    ----------
    schema_filepath : str
        Path to the SQL schema executed against the in-memory db.
    ram_size : int
        Size of the cache (presumably in MB — TODO confirm).
    """

    def __init__(self, schema_filepath, ram_size):
        self.schema_filepath = schema_filepath
        self.db_conn = self._setup_db()
        self.ram_size = ram_size

    def _setup_db(self):
        """Create an in-memory SQLite db and apply the schema.

        Returns
        -------
        db_conn
            Connection to the freshly created ':memory:' database.
        """
        conn = connect_db(':memory:')
        logger.debug("<color>Executing schema for db ':memory:' ...</color>")
        # Apply the full schema in one shot.
        with open(self.schema_filepath, 'rt') as schema_file:
            conn.executescript(schema_file.read())
        return conn