# Source code for dhelp.web

#!/usr/bin/python

import time

from collections import UserString

import requests
from bs4 import BeautifulSoup


class WebPage(UserString):
    """Downloads a web page and parses its HTML into a BeautifulSoup object.

    Combines ``requests`` and ``BeautifulSoup`` so a page can be requested
    and parsed in a single call. The instance behaves like a string whose
    value is the URL (it is a ``UserString`` with ``data`` set to the URL).

    Args:
        url (:obj:`str`) URL of the page you wish to scrape
        options (:obj:`dict`, optional) Keyword/value pairs overriding the
            defaults. Recognised keys:

            * ``'delay'`` - seconds to sleep before every request
              (default ``2``)
            * ``'max_retries'`` - retries allowed after a failed request;
              ``0`` means retry forever (default ``0``)
            * ``'silent'`` - suppress progress messages (default ``False``)
            * ``'parser'`` - parser name handed to BeautifulSoup
              (default ``'html.parser'``)

            The dictionary is copied; the caller's object is never modified.

    Raises:
        TypeError: If ``url`` is not a string.

    Examples:
        >>> from dhelp import WebPage
        >>> web_page = WebPage('https://stackoverflow.com')
        >>> print(web_page)
        https://stackoverflow.com
        >>> # pass a dict to set options for delay, max_retries, silent
        >>> # or parser
        >>> options = {
        ...     'delay': 4,
        ...     'max_retries': 3,
        ...     'silent': True,
        ...     'parser': 'html.parser',
        ... }
        >>> web_page = WebPage('https://stackoverflow.com', options=options)
        >>> print(web_page)
        https://stackoverflow.com
    """

    # Fallback values for every option key the class reads. Treated as
    # read-only: each instance gets its own merged copy.
    _DEFAULT_OPTIONS = {
        'delay': 2,
        'max_retries': 0,
        'silent': False,
        'parser': 'html.parser',
    }

    def __init__(self, url, options=None):
        # Validate before touching the parent so a bad URL never produces a
        # half-initialised object.
        if not isinstance(url, str):
            raise TypeError('URL must be a string')
        # UserString stores the given string in self.data.
        super().__init__(url)
        # Build a fresh dict per instance: neither the defaults nor the
        # caller's dictionary are shared or mutated.
        merged = dict(self._DEFAULT_OPTIONS)
        if options:
            merged.update(options)
        self.options = merged

    def __enter__(self):
        """Fetch and parse the page, returning the result of ``soup()``."""
        return self.soup()

    def __exit__(self, ctx_type, ctx_value, ctx_traceback):
        """Report success when the ``with`` body finished without an error.

        Nothing is printed if the body raised or if the ``'silent'`` option
        is set. Returns ``None``, so exceptions are never suppressed.
        """
        if ctx_type is None and not self.options['silent']:
            print('Successfully scraped', self.data)

    def fetch(self, retry_counter=0):
        """Returns the body of an HTTP GET request to the URL as a string.

        Can be called to get the raw HTML, although it is mainly intended to
        be called by ``.soup()`` to feed its parser. Before every attempt the
        method sleeps for ``options['delay']`` seconds to reduce server load.

        If the request raises a ``requests`` exception, the request is
        repeated until it succeeds or the retry budget is used up. When
        ``options['max_retries']`` is ``0`` the request is retried
        indefinitely. The HTTP status code is not inspected; any response
        that ``requests.get`` returns is treated as a success.

        Args:
            retry_counter (:obj:`int`) Number of retries already made.
                Callers should normally leave this at its default of 0.

        Returns:
            :obj:`str` Text of the response, or ``None`` if the retry limit
            was reached without a successful request.

        Examples:
            >>> html_text = WebPage('https://stackoverflow.com/').fetch()
        """
        silent = self.options['silent']
        max_retries = self.options['max_retries']

        # Iterate instead of recursing so that long runs of failures (in
        # particular with unlimited retries) cannot exhaust the call stack.
        while True:
            if not silent:
                print('Fetching', self.data)
            # enforce delay to reduce server load
            time.sleep(self.options['delay'])

            try:
                response = requests.get(self.data)
            except requests.exceptions.RequestException:
                # Only request-level failures are retried; unrelated errors
                # propagate to the caller instead of looping forever.
                if not silent:
                    print('Problem fetching', self.data)
                if not max_retries:
                    # 0 (or any falsy value) means unlimited retries
                    if not silent:
                        print('Retrying...')
                    continue
                if retry_counter < max_retries:
                    # still below the retry limit: count this retry
                    if not silent:
                        print('Retrying')
                    retry_counter += 1
                    continue
                if not silent:
                    print('Retry limit reached, skipping', self.data)
                return None

            # return the page text rather than the whole response object
            return response.text

    def soup(self):
        """Returns a BeautifulSoup object loaded with HTML from the URL.

        Calls ``.fetch()`` and hands its result to ``BeautifulSoup`` together
        with the parser named in ``options['parser']`` (``'html.parser'``
        unless overridden). The result of ``.fetch()`` is passed on
        unchanged, including ``None`` when the retry limit was reached.

        Returns:
            :obj:`bs4.BeautifulSoup` BeautifulSoup object built from the
            fetched page

        Examples:
            >>> # fetch webpage and parse into BeautifulSoup object
            >>> parsed_webpage = WebPage('https://stackoverflow.com/').soup()
            >>> # grab the first header element with BeautifulSoup
            >>> header = parsed_webpage.find('header')
        """
        return BeautifulSoup(self.fetch(), self.options['parser'])