Source code for pyutil.lazy_downloader

#!/usr/bin/env python
# Maintainer: Faris Chugthai
"""Automates downloading plain text files from the Web.

===============
Lazy Downloader
===============

As currently implemented, it will only correctly handle plain text;
however, there are plans to use the :mod:`mimetypes` module to
properly handle a much wider range of files.

The `url` parameter is required; `output_fname` is optional and
defaults to a name derived from the URL.


Safety Features
---------------

If the filename already exists on the system, it will NOT be
overwritten, and the script will exit safely.
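
This mirrors Python's exclusive-creation open mode, which the script
writes with. A doctest sketch of the behaviour (the filename here is
hypothetical)::

    >>> from pathlib import Path
    >>> Path('already-there.txt').touch()  # make sure the file exists
    >>> open('already-there.txt', 'xt')
    Traceback (most recent call last):
        ...
    FileExistsError: [Errno 17] File exists: 'already-there.txt'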


Setting User Options
--------------------

This module is a perfect candidate for :class:`collections.ChainMap`.
We could check environment variables, config files, command line
arguments and user-provided parameters, ranked in that order of
importance, when configuring the download.
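
A rough sketch of the idea; the ``config_file``, ``cli_args`` and
``user_params`` mappings below are hypothetical stand-ins::

    import os
    from collections import ChainMap

    # Hypothetical configuration layers, for illustration only.
    config_file = {}
    cli_args = {}
    user_params = {'URL': 'https://example.com/notes.txt'}

    # ChainMap searches its maps left to right, so this ordering gives
    # environment variables the highest priority, followed by the
    # config file, command line arguments and user parameters.
    settings = ChainMap(os.environ, config_file, cli_args, user_params)
    url = settings['URL']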


Attributes
----------
url : str
    A URL to download.
output_fname : str, optional
    A path to write the downloaded content to. Defaults to the last
    component of the URL when split on forward slashes, or :kbd:`/`.
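
For example, the default is derived with :func:`urllib.parse.urlparse`
(a doctest sketch; the URL is illustrative)::

    >>> from urllib.parse import urlparse
    >>> urlparse('https://example.com/files/notes.txt').path.split('/')[-1]
    'notes.txt'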


"""
import argparse
from contextlib import closing
import json
import logging
import os
from urllib.parse import urlparse

import requests

from pyutil.__about__ import __version__

logger = logging.getLogger(__name__)


def _parse_arguments():
    """Parse user input."""
    parser = argparse.ArgumentParser(prog=__name__, description=__doc__)

    parser.add_argument("URL",
                        type=str,
                        metavar="URL",
                        help="The URL to download. Must be plain text.")

    parser.add_argument(
        "fname",
        nargs="?",
        metavar="Output filename",
        help="The name of the file to write to. Must not already exist. "
        "Defaults to the last path component of the URL.")

    # argparse can't build a dict from the command line directly, so
    # accept the headers as a JSON string and decode them in main().
    parser.add_argument("-ha",
                        "--headers",
                        metavar="headers",
                        help="Headers to send to the web server, "
                        "as a JSON string.")

    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args()
    return args


def _parse_url(URL):
    """Parse the URL in order to get something usable if we don't get a fname.

    If no output filename is given, don't crash!

    Parameters
    ----------
    URL : str
        A live URL to download a page from.

    Returns
    -------
    str
        The last component of the URL's path, i.e. everything after
        the final :kbd:`/`.

    """
    # urlparse returns a namedtuple, so the path is an attribute
    # rather than a subscriptable key.
    stripped_url = urlparse(URL).path
    return stripped_url.split('/')[-1]


def _get_page(URL):
    """Get the content at `URL`.

    Returns the raw content if the response passes :func:`check_response`.
    If not, return `None`.
    """
    try:
        with closing(requests.get(URL, stream=True)) as res:
            if check_response(res):
                return res.content
            return None
    except requests.RequestException:
        logger.exception("Request to %s failed", URL)
        return None


def check_response(server_response):
    """Check that the server sent a Content-Type header and a *200* status."""
    # ``headers.get`` avoids a KeyError when the header is missing.
    content = server_response.headers.get('Content-Type')
    if server_response.status_code == 200 and content is not None:
        return True
    logger.warning("Unexpected response: status %s, Content-Type %s",
                   server_response.status_code, content)
    return False


def _parse_site(URL, **kwargs):
    """Download the page at the given `URL` and return its body as text.

    User agent and header arguments can be forwarded through `kwargs`.

    Parameters
    ----------
    URL : str
        Page to download.
    **kwargs : dict
        Keyword arguments passed through to :func:`requests.get`,
        e.g. ``headers``.

    Returns
    -------
    txt : str
        Text of the website.

    """
    res = requests.get(URL, **kwargs)
    res.raise_for_status()
    # ``Response.text`` is a property, not a method.
    txt = res.text
    return txt


def main():
    """Download URL and write to disk."""
    args = _parse_arguments()
    url = args.URL

    # Fall back to a filename derived from the URL when none was given.
    fname = args.fname if args.fname else _parse_url(url)

    # With "xt" permissions the script crashes on an existing file, so
    # just bail out early.
    if os.path.isfile(fname):
        raise FileExistsError(fname)

    std_headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
    }

    if args.headers is not None:
        # ``--headers`` arrives as a JSON string; see _parse_arguments().
        headers = json.loads(args.headers)
    else:
        headers = std_headers

    # try:
    #     user_agent = args.user_agent
    # except Exception:
    #     user_agent = USER_AGENTS

    txt = _parse_site(url, headers=headers)
    with open(fname, "xt") as f:
        f.write(txt)


if __name__ == "__main__":
    main()
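
# Example invocation, assuming the package is importable; the URL and
# filename below are purely illustrative:
#
#     python -m pyutil.lazy_downloader https://example.com/notes.txt notes.txt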