Source code for dhelp.web

#!/usr/bin/python

import time

from collections import UserString

import requests
from bs4 import BeautifulSoup


class WebPage(UserString):
    """Downloads and parses HTML into BeautifulSoup objects.

    Provides methods to download/parse a specified webpage. Merges the
    requests package with BeautifulSoup functions to enable users to
    request/soup a page in a single line. The object itself behaves like
    a string holding the URL.

    Args:
        url (:obj:`str`) URL of page you wish to scrape
        options (:obj:`dict`, optional) Dictionary with keyword/value pairs
            to set options. Recognised keys are ``delay`` (seconds to sleep
            before every request, default 2), ``max_retries`` (number of
            retries after a failed request; 0 means retry forever,
            default 0), ``silent`` (suppress progress messages, default
            False) and ``parser`` (parser name handed to BeautifulSoup,
            default 'html.parser'). The dictionary is copied, never
            modified.

    Raises:
        TypeError: If ``url`` is not a string.

    Examples:
        >>> from dhelp import WebPage
        >>> web_page = WebPage('https://stackoverflow.com')
        >>> print(web_page)
        https://stackoverflow.com
        >>> # pass a dict to set options for delay, max_retries, etc.
        >>> options = {
        ...     'delay': 4,
        ...     'max_retries': 3,
        ...     'silent': True,
        ...     'parser': 'html.parser',
        ... }
        >>> web_page = WebPage('https://stackoverflow.com', options=options)
        >>> print(web_page)
        https://stackoverflow.com
    """

    # Values used for every option key the caller does not supply.
    _DEFAULT_OPTIONS = {
        'delay': 2,
        'max_retries': 0,
        'silent': False,
        'parser': 'html.parser',
    }

    def __init__(self, url, options=None):
        # Validate before touching the parent so a bad argument never
        # produces a half-initialised object.
        if not isinstance(url, str):
            raise TypeError('URL must be a string')
        # UserString stores the wrapped text in self.data.
        super().__init__(url)
        # Build a private dict: defaults first, caller overrides second.
        # Copying keeps instances independent of each other and leaves the
        # caller's dictionary untouched.
        merged = dict(self._DEFAULT_OPTIONS)
        if options:
            merged.update(options)
        self.options = merged

    def __enter__(self):
        """Fetch and parse the page, returning the result of .soup()."""
        return self.soup()

    def __exit__(self, ctx_type, ctx_value, ctx_traceback):
        """Report success unless silenced or the with-body raised.

        Returns None, so any exception raised inside the ``with`` block
        keeps propagating.
        """
        if ctx_type is None and not self.options['silent']:
            print('Successfully scraped', self.data)

    def fetch(self, retry_counter=0):
        """Returns http request from URL as a string.

        Can be called to return HTML data, although not generally meant to
        be called directly by user. If user calls .fetch(), retry_counter
        should not be passed so that it will start at 0. This function is
        intended to be called by .soup() in order to feed its parser.

        If the request was not successful, .fetch() tries again until it
        is either successful, or the number of retries given by the
        ``max_retries`` option has been used up. If ``max_retries`` is set
        to 0, .fetch() will retry indefinitely. The ``delay`` option is
        slept before every attempt, including retries.

        Args:
            retry_counter (:obj:`int`) The number of retries already made
                to fetch the object.

        Returns:
            :obj:`str` HTML from requested URL, in plain text format, or
            None if the retry limit was reached without success.

        Examples:
            >>> html_text = WebPage('https://stackoverflow.com/').fetch()
            >>> html_text[:15]
            '<!DOCTYPE html>'
        """
        silent = self.options['silent']
        max_retries = self.options['max_retries']
        # Loop instead of recursing so that an unbounded number of retries
        # cannot exhaust the interpreter's recursion limit.
        while True:
            # print message unless silent option
            if not silent:
                print('Fetching', self.data)
            # enforce delay to reduce server load
            time.sleep(self.options['delay'])
            try:
                response = requests.get(self.data)
            # Deliberately broad: any failure while requesting the page is
            # treated as "try again" (best-effort scraping).
            except Exception:
                if not silent:
                    print('Problem fetching', self.data)
                # A falsy max_retries means unlimited retries.
                if not max_retries:
                    if not silent:
                        print('Retrying...')
                elif retry_counter < max_retries:
                    if not silent:
                        print('Retrying')
                else:
                    # retry limit has been hit, stop fetching
                    if not silent:
                        print('Retry limit reached, skipping', self.data)
                    return None
                retry_counter += 1
            else:
                # return the page html instead of the entire response
                return response.text

    def soup(self):
        """Returns a BeautifulSoup object loaded with HTML data from the URL

        Invokes web request then returns a soup object loaded with page
        HTML. The parser is taken from the ``parser`` option
        ('html.parser' unless overridden).

        Returns:
            :obj:`bs4.BeautifulSoup` BeautifulSoup object loaded with
            parsed data from web, or None if the page could not be
            fetched within the retry limit.

        Examples:
            >>> # fetch webpage and parse into BeautifulSoup object
            >>> parsed_webpage = WebPage('https://stackoverflow.com/').soup()
            >>> # grab the page title with BeautifulSoup
            >>> title_tag = parsed_webpage.find('title')
            >>> # print the text contained in the tag
            >>> print(title_tag.get_text())
            Stack Overflow...
        """
        html = self.fetch()
        # fetch() gives None once it has given up; there is nothing to
        # parse in that case, so pass the None through to the caller.
        if html is None:
            return None
        return BeautifulSoup(html, self.options['parser'])