import logging import requests import re from lxml import html import math ### # Scanner to search for uswds compliance. It is just scraping the front page # and CSS files and searching for particular content. # Set a default number of workers for a particular scan type. # Overridden by a --workers flag. XXX not actually overridden? workers = 50 # Required scan function. This is the meat of the scanner, where things # that use the network or are otherwise expensive would go. # # Runs locally or in the cloud (Lambda). def scan(domain: str, environment: dict, options: dict) -> dict: results = {} for i in headers: results[i] = 0 results['uswdsversion'] = "" # Get the url try: response = requests.get("http://" + domain, timeout=5) except Exception: logging.debug("got error while querying %s", domain) results["domain"] = domain results["status_code"] = -1 return results # check for class.*usa- in body res = re.findall(r'class.*"usa-', response.text) if res: results["usa_classes_detected"] = round(math.sqrt(len(res))) * 5 # # check for official text # # (testing revealed that this generated FPs) # # XXX Try this in the header only? # res = re.findall(r'fficial website of the', response.text) # if res: # results["official_website_detected"] = len(res) # check for uswds in text anywhere res = re.findall(r'uswds', response.text) if res: results["uswds_detected"] = len(res) # check for .usa- in text anywhere res = re.findall(r'\.usa-', response.text) if res: results["usa_detected"] = len(res) # check for favicon-57.png (flag) in text anywhere res = re.findall(r'favicon-57.png', response.text) if res: results["flag_detected"] = 20 # count how many tables are in the, to deduct from the score res = re.findall(r'