#!/usr/bin/env python
# Maintainer: Faris Chugthai
"""Automates downloading plain text files from the Web.
===============
Lazy Downloader
===============
As implemented currently, it will only correctly handle plain text;
however, there are plans to implement the :mod:`mimetype` module and
properly handle a much wider range of files.
Both parameters, `url` and `output_fname` are required parameters.

Safety Features
---------------
If the filename already exists on the system it will NOT be overwritten,
and the script will safely exit.

Setting User Options
--------------------
This module is a perfect candidate for :class:`collections.ChainMap`.
We could check environment variables, config files, command-line arguments,
and user-provided parameters, and rank them in that order of importance
when configuring the download.
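
A rough sketch of that precedence, assuming a hypothetical ``LAZYDL_``
environment-variable prefix and option names that are not part of this
module::

    import os
    from collections import ChainMap

    env_opts = {key[len('LAZYDL_'):].lower(): value
                for key, value in os.environ.items()
                if key.startswith('LAZYDL_')}
    config_opts = {}   # e.g. parsed from a config file
    cli_opts = {}      # e.g. vars(_parse_arguments()) minus None values
    defaults = {'output_fname': None, 'headers': None}

    # ChainMap searches its maps left to right, so earlier maps win,
    # matching the ranking described above.
    options = ChainMap(env_opts, config_opts, cli_opts, defaults)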

Attributes
----------
url : str
    A URL to download.
output_fname : str, optional
    A path to write the downloaded content to. Defaults to the last
    section of the URL when split on forward slashes, or :kbd:`/`.
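
Examples
--------
Run as a script; the sample URL and output filename here are illustrative
only, and the output file must not already exist::

    python -m pyutil.lazy_downloader https://example.com/notes.txt notes.txt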
"""
import argparse
from contextlib import closing
import json
import logging
import os
import re
from urllib.parse import urlparse

import requests

from pyutil.__about__ import __version__

logger = logging.getLogger(__name__)


def _parse_arguments():
    """Parse user input."""
    parser = argparse.ArgumentParser(prog=__name__, description=__doc__)
    parser.add_argument("URL",
                        nargs=1,
                        type=str,
                        metavar="URL",
                        help="The URL to download. Must be plaintext.")
    parser.add_argument(
        "fname",
        metavar="Output filename",
        help="The name of the file to write to. Must not exist already.")
    parser.add_argument("-ha",
                        "--headers",
                        metavar="headers",
                        nargs='?',
                        type=json.loads,
                        help="Headers to send to the web server, given as "
                             "a JSON object string.")
    parser.add_argument('-V',
                        '--version',
                        action='version',
                        version='%(prog)s ' + __version__)
    args = parser.parse_args()
    return args 


def _parse_url(URL):
    """Parse the URL to get something usable if we don't get a fname.

    If no output filename is given, don't crash; fall back to the last
    segment of the URL's path.

    Parameters
    ----------
    URL : str
        A live URL to download a page from.

    Returns
    -------
    str
        The last segment of the URL's path, i.e. everything after the
        final :kbd:`/`.
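
    Examples
    --------
    A rough sketch of the intended behaviour (the URL is illustrative,
    not a tested doctest)::

        >>> _parse_url('https://example.com/docs/readme.txt')
        'readme.txt'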
    """
    stripped_url = urlparse(URL).path
    return stripped_url.split('/')[-1]


def _get_page(URL):
    """Get the content at `URL`.

    Returns the raw response body if :func:`check_response` accepts the
    server's reply. If not, returns `None`.
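
    A rough usage sketch (the URL is illustrative and the call makes a
    real network request, so this is not a tested doctest)::

        >>> body = _get_page('https://example.com/notes.txt')
        >>> body is None or isinstance(body, bytes)
        True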
    """
    try:
        with closing(requests.get(URL, stream=True)) as res:
            if check_response(res):
                return res.content
            else:
                return None
    except requests.RequestException:
        logger.exception("Request to %s failed", URL)
        return None


def check_response(server_response):
    """Check that the server replied with *200* and a ``Content-Type`` header."""
    content = server_response.headers.get('Content-Type')
    if server_response.status_code == 200 and content is not None:
        return True
    else:
        logger.warning("Unexpected response: %s", server_response.status_code)
        return False


def _parse_site(URL, **kwargs):
    """Download the page at `URL` and return its body as text.

    Despite the name, no tag stripping is performed yet; the raw response
    text is returned. Extra keyword arguments (for example ``headers``)
    are passed straight through to :func:`requests.get`.

    Parameters
    ----------
    URL : str
        Page to download.

    Returns
    -------
    txt : str
        Text of the response body.
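
    Examples
    --------
    A minimal sketch (the URL and headers are illustrative only, and the
    call makes a real network request)::

        >>> txt = _parse_site('https://example.com/robots.txt',
        ...                   headers={'Accept': 'text/plain'})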
    """
    res = requests.get(URL, **kwargs)
    res.raise_for_status()
    txt = res.text
    return txt


def find_links(text):
    """Search a body of text for URLs.

    Parameters
    ----------
    text : str
        Body of formatted text to search for URLs.

    Returns
    -------
    links : list of tuple
        Each match is a ``(url, scheme)`` tuple, because the regular
        expression contains two capture groups.
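
    Examples
    --------
    A rough sketch of the match structure::

        >>> find_links('<a href="https://example.com/a">link</a>')
        [('https://example.com/a', 'http')]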
    """
    links = re.findall('"((http|ftp)s?://.*?)"', text)
    return links 


def main():
    """Download a URL and write its contents to disk."""
    args = _parse_arguments()
    fname = args.fname
    # The file is opened with mode "xt" below, which raises if the path
    # already exists, so bail out early with a clearer error.
    if os.path.isfile(fname):
        raise FileExistsError(fname)
    # args.URL is a one-element list because of ``nargs=1``
    url = args.URL[0]
    std_headers = {
        'User-Agent':
        'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
    }
    # ``--headers`` defaults to ``None``, so fall back to the stock headers
    headers = args.headers if args.headers else std_headers
    txt = _parse_site(url, headers=headers)
    with open(fname, "xt") as f:
        f.write(txt) 
if __name__ == "__main__":
    main()