import logging
import re
import urllib.request

import requests

###
# Scan focused on learning about the /privacy page, as per
# https://github.com/18F/site-scanning/issues/89.

# Set a default number of workers for a particular scan type.
# Overridden by a --workers flag.  XXX not actually overridden?
workers = 50


# Merge two lists into one, dropping duplicates. Ordering of the result
# is not guaranteed: mergelists([1, 2], [2, 3]) returns some ordering
# of [1, 2, 3].
def mergelists(a, b):
    return list(set().union(a, b))


# Required scan function. This is the meat of the scanner, where things
# that use the network or are otherwise expensive would go.
#
# Runs locally or in the cloud (Lambda).
def scan(domain: str, environment: dict, options: dict) -> dict:
    logging.debug("Scan function called with options: %s", options)

    results = {}

    url = 'https://' + domain + '/privacy'

    # Get the status code and final (post-redirect) URL for /privacy.
    try:
        response = requests.head(url, allow_redirects=True, timeout=4)
        results['status_code'] = str(response.status_code)
        results['final_url'] = response.url
    except Exception:
        logging.debug("could not get data from %s", url)
        results['status_code'] = str(-1)
        results['final_url'] = ''

    # Search /privacy for email addresses and h1/h2/h3 headings.
    results['emails'] = []
    results['h1'] = []
    results['h2'] = []
    results['h3'] = []
    try:
        with urllib.request.urlopen(url, timeout=5) as privacypage:
            for line in privacypage:
                # Tolerate non-UTF-8 bytes rather than aborting the scan.
                line = line.decode(errors='replace').rstrip()

                # Rough pattern for email addresses; loose on purpose.
                emails = re.findall(r'[\w.+-]+@[\w.-]+', line)
                if emails:
                    results['emails'] = mergelists(emails, results['emails'])

                # These patterns only catch headings whose tags open and
                # close on the same line and carry no attributes.
                h1s = re.findall(r'<h1>(.*?)</h1>', line)
                h2s = re.findall(r'<h2>(.*?)</h2>', line)
                h3s = re.findall(r'<h3>(.*?)</h3>', line)
                if h1s or h2s or h3s:
                    results['h1'] = mergelists(h1s, results['h1'])
                    results['h2'] = mergelists(h2s, results['h2'])
                    results['h3'] = mergelists(h3s, results['h3'])
    except Exception:
        logging.debug('error while scraping emails/headings from %s', url)

    logging.warning("privacy %s Complete!", domain)

    return results


# Required CSV row conversion function. Usually one row, can be more.
#
# Run locally.
def to_rows(data):
    row = []
    for field in headers:
        row.append(data[field])
    return [row]


# CSV headers for each row of data. Referenced locally.
headers = [
    'status_code',
    'final_url',
    'emails',
    'h1',
    'h2',
    'h3',
]
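
# Sketch of how headers and to_rows() fit together: a driver would write
# the header row once, then one row per scanned domain. This wiring is
# hypothetical -- the actual CSV assembly lives in the domain-scan
# framework, not in this module:
#
#   import csv
#   with open('privacy.csv', 'w', newline='') as f:
#       writer = csv.writer(f)
#       writer.writerow(headers)
#       writer.writerows(to_rows(scan(domain, {}, {})))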
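
# A minimal sketch for running this scanner by hand while debugging.
# The real pipeline (domain-scan's CLI) supplies the environment and
# options dicts; empty dicts are assumed here, and the example domain
# is an arbitrary placeholder.
if __name__ == '__main__':
    import json
    import sys

    example_domain = sys.argv[1] if len(sys.argv) > 1 else 'gsa.gov'
    print(json.dumps(scan(example_domain, {}, {}), indent=2))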