import logging
import re
import urllib.request

import requests
from lxml import etree

###
# Scan focused on learning about the sitemap.xml file, as per
# https://github.com/18F/site-scanning/issues/87.

# Set a default number of workers for a particular scan type.
# Overridden by a --workers flag. XXX not actually overridden?
workers = 50


# Required scan function. This is the meat of the scanner, where things
# that use the network or are otherwise expensive would go.
#
# Runs locally or in the cloud (Lambda).
def scan(domain: str, environment: dict, options: dict) -> dict:
    logging.debug("Scan function called with options: %s", options)

    results = {}

    # Get status_code and final_url for sitemap.xml.
    try:
        response = requests.head("https://" + domain + '/sitemap.xml',
                                 allow_redirects=True, timeout=4)
        results['status_code'] = str(response.status_code)
        results['final_url'] = response.url
    except Exception:
        logging.debug("could not get data from %s/sitemap.xml", domain)
        results['status_code'] = str(-1)
        results['final_url'] = ''

    # Stream-parse the sitemap and count the <url> tags.
    url = 'https://' + domain + '/sitemap.xml'
    i = 0
    try:
        with urllib.request.urlopen(url, timeout=5) as sitemap:
            for _, element in etree.iterparse(sitemap):
                tag = etree.QName(element.tag).localname
                if tag == 'url':
                    i = i + 1
                element.clear()
    except Exception:
        logging.debug('error while trying to retrieve sitemap.xml for %s', domain)
    results['url_tag_count'] = i

    # Search robots.txt for sitemap locations.
    url = 'https://' + domain + '/robots.txt'
    results['sitemap_locations_from_robotstxt'] = []
    try:
        with urllib.request.urlopen(url, timeout=5) as robots:
            for line in robots:
                line = line.decode().rstrip()
                sitemaps = re.findall('[sS]itemap: (.*)', line)
                if sitemaps:
                    results['sitemap_locations_from_robotstxt'] = list(
                        set().union(sitemaps, results['sitemap_locations_from_robotstxt']))
    except Exception:
        logging.debug('error while trying to retrieve robots.txt for %s', url)

    logging.warning("sitemap %s Complete!", domain)

    return results


# Required CSV row conversion function. Usually one row, can be more.
#
# Run locally.
def to_rows(data):
    row = []
    for page in headers:
        row.extend([data[page]])
    return [row]


# CSV headers for each row of data. Referenced locally.
headers = [
    'status_code',
    'final_url',
    'url_tag_count',
    'sitemap_locations_from_robotstxt',
]
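

# ---------------------------------------------------------------------------
# Minimal local usage sketch (an assumption for illustration, not part of the
# scanner contract): it smoke-tests scan() and to_rows() against a single
# domain passed on the command line. The fallback domain "gsa.gov" and the
# empty environment/options dicts are placeholders, not values the
# site-scanning pipeline necessarily passes.
if __name__ == '__main__':
    import csv
    import sys

    target = sys.argv[1] if len(sys.argv) > 1 else 'gsa.gov'
    data = scan(target, environment={}, options={})

    # Print the CSV header row followed by the row(s) produced by to_rows().
    writer = csv.writer(sys.stdout)
    writer.writerow(headers)
    for row in to_rows(data):
        writer.writerow(row)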