#!/usr/bin/env python3

import csv
import glob
import importlib
import logging
import os
import re
import sys

import requests

from utils import utils

# Some metadata about the scan itself.
start_time = utils.local_now()
start_command = " ".join(sys.argv)

# Applied if --ignore-www is enabled.
strip_www = re.compile(r"^www\.")

# Applied to all domains.
strip_protocol = re.compile(r"^https?://")
strip_wildcard = re.compile(r"^(\*\.)+")
strip_redacted = re.compile(r"^(\?\.)+")


def run(options=None, cache_dir="./cache", results_dir="./results"):
    sources = options["gatherers"]

    suffixes = options.get("suffix")
    suffix_pattern = utils.suffix_pattern(suffixes)

    # Clear out existing result CSVs, to avoid inconsistent data.
    for result in glob.glob("%s/*.csv" % results_dir):
        os.remove(result)

    # Opt in to including parent (second-level) domains.
    include_parents = options.get("include_parents", False)

    # Opt in to stripping www. prefixes from hostnames, effectively
    # collapsing www.[host] and [host] into one record.
    ignore_www = options.get("ignore_www", False)

    # --parents should be a CSV whose first column is parent domains
    # that will act as a whitelist for which subdomains to gather.
    parents = get_parent_domains(options, cache_dir=cache_dir)

    # De-duping hostnames. This will cause the system to hold all
    # hostnames in memory at once, but oh well.
    hostnames_cache = {}

    for source in sources:
        extra = {}

        try:
            gatherer_module = importlib.import_module(
                "gatherers.%s" % source)
            gatherer = gatherer_module.Gatherer(suffixes, options, extra)
        except ImportError:
            # If it's not a registered module, allow it to be "hot registered"
            # as long as the user gave us a flag with that name that can be
            # used as the --url option to the URL module.
            if options.get(source):
                gatherer_module = importlib.import_module("gatherers.url")
                extra['name'] = source
                gatherer = gatherer_module.Gatherer(suffixes, options, extra)
            else:
                exc_type, exc_value, exc_traceback = sys.exc_info()
                logging.error(
                    "[%s] Gatherer not found, or had an error during loading."
                    "\n\tERROR: %s\n\t%s" % (source, exc_type, exc_value))
                exit(1)

        # Iterate over each hostname.
        for domain in gatherer.gather():

            # Always apply the suffix filter to returned names.
            if not suffix_pattern.search(domain):
                continue

            # Strip off whitespace before pre-processing.
            domain = domain.strip()

            # Cut off protocols, if present.
            domain = strip_protocol.sub("", domain)

            # Cut naive wildcard prefixes out. (From certs.)
            domain = strip_wildcard.sub("", domain)

            # Cut off any redaction markers from names. (From certs.)
            domain = strip_redacted.sub("", domain)

            # Strip www. prefixes from hostnames, effectively
            # collapsing www.[host] and [host] into one record.
            if ignore_www:
                domain = strip_www.sub("", domain)

            # Strip off whitespace after pre-processing.
            domain = domain.strip()

            base = utils.base_domain_for(domain)

            # Unless --include-parents is specified, exclude parent domains.
            if not include_parents:
                # Always ignore www prefixes for base domains.
                if (domain == base) or (domain == "www.%s" % base):
                    continue

            # Apply the --parents domain whitelist, if present.
            if parents and (base not in parents):
                continue

            # Use the hostname cache to de-dupe, tracking which
            # sources have seen each hostname.
            if domain not in hostnames_cache:
                hostnames_cache[domain] = [source]
            elif source not in hostnames_cache[domain]:
                hostnames_cache[domain].append(source)

    # Now that we've gone through all sources and logged when each
    # domain appears in each one, go through the cache and write
    # all of them to disk.
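
    # Illustrative shape of the gathered CSV (hypothetical hostnames and
    # source names "alpha"/"beta"; the real boolean columns depend on the
    # gatherers requested):
    #
    #   Domain,Base Domain,alpha,beta
    #   sub.example.gov,example.gov,True,False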

    # Assemble headers.
    headers = ["Domain", "Base Domain"]

    # Add headers dynamically for each source.
    headers += sources

    # Write each hostname to disk, with all discovered sources.
    gathered_filename = "%s/%s.csv" % (results_dir, "gathered")
    with open(gathered_filename, 'w', newline='') as gathered_file:
        gathered_writer = csv.writer(gathered_file)
        gathered_writer.writerow(headers)

        for hostname in sorted(hostnames_cache.keys()):
            base = utils.base_domain_for(hostname)
            row = [hostname, base]

            # One boolean column per source: True if that source saw the name.
            for source in sources:
                row += [source in hostnames_cache[hostname]]

            gathered_writer.writerow(row)

    # If sorting is requested, sort in place by domain.
    if options.get("sort"):
        utils.sort_csv(gathered_filename)

    logging.warning("Results written to CSV.")

    # Save metadata.
    end_time = utils.local_now()
    metadata = {
        'start_time': utils.utc_timestamp(start_time),
        'end_time': utils.utc_timestamp(end_time),
        'command': start_command
    }
    utils.write(utils.json_for(metadata), "%s/meta.json" % results_dir)


# Read in parent domains from the first column of a given CSV.
def get_parent_domains(options, cache_dir="./cache"):
    parents = options.get("parents")

    if not parents:
        return None

    # If --parents is a URL, download it now, and then adjust the value
    # to be the path of the cached download.
    if parents.startswith("http:") or parents.startswith("https:"):

        # Though it's saved in cache/, it will be downloaded every time.
        parents_path = os.path.join(cache_dir, "parents.csv")

        try:
            response = requests.get(parents)
            # Treat HTTP error codes as download failures.
            response.raise_for_status()
            utils.write(response.text, parents_path)
        except Exception:
            logging.error("Parent domains URL not downloaded successfully.")
            print(utils.format_last_exception())
            exit(1)

        parents = parents_path

    parent_domains = []
    with open(parents, encoding='utf-8', newline='') as csvfile:
        for row in csv.reader(csvfile):
            # Skip blank rows and header rows.
            if (not row) or (not row[0]):
                continue
            if row[0].lower() in ("domain", "domain name"):
                continue
            parent_domains.append(row[0].lower())

    return parent_domains


if __name__ == '__main__':
    options = utils.options_for_gather()
    utils.configure_logging(options)

    # Support the --output flag for changing where cache/ and results/ go.
    cache_dir = utils.cache_dir(options)
    results_dir = utils.results_dir(options)
    utils.mkdir_p(cache_dir)
    utils.mkdir_p(results_dir)

    run(options, cache_dir=cache_dir, results_dir=results_dir)
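
# Illustrative invocation, assuming this script is saved as "gather" and that
# utils.options_for_gather() accepts a comma-separated gatherer list plus the
# flags referenced above ("alpha" and "beta" are hypothetical gatherer names,
# and parents.csv is a hypothetical whitelist file):
#
#   ./gather alpha,beta --suffix=.gov --parents=parents.csv --ignore-www --sort
#
# Results land in results/gathered.csv and results/meta.json, or under the
# directory given by --output.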